package edu.cmu.geolocator.common; import java.util.ArrayList; import java.util.List; import org.apache.lucene.document.Document; import edu.cmu.geolocator.model.CandidateAndFeature; import edu.cmu.geolocator.model.LocEntityAnnotation; import edu.cmu.geolocator.nlp.tokenizer.EuroLangTwokenizer; import edu.cmu.geolocator.resource.ResourceFactory; import edu.cmu.geolocator.resource.gazindexing.CollaborativeIndex.InfoFields; public class TweetDisambUtil { public static boolean countryInTimezone(Document[] timeZone, CandidateAndFeature aFeature) { String candCountry = aFeature.getCountryCode(); for (Document atz : timeZone) { String atzCountry = atz.get(InfoFields.countryCode); if (candCountry.equals(atzCountry)) { return true; } } return false; } public static Document[] twitterTimezone2Country(String ttz) { if (ttz.length() == 0 || ttz == null) return null; if (ttz.endsWith("(US & Canada)")) { String full = ResourceFactory.getCountryCode2CountryMap().getValue("us").getAsciiName(); ArrayList<Document> us = ResourceFactory.getClbIndex().getDocumentsByPhrase(full); Document theUS = null; long pop = -1; for (Document eachUS : us) { long eachPop = Long.parseLong(eachUS.get("POPULATION")); if (eachPop > pop) { theUS = eachUS; pop = eachPop; } } ArrayList<Document> ca = ResourceFactory.getClbIndex().getDocumentsByPhrase("Canada"); Document theCA = null; pop = -1; for (Document eachCA : ca) { long eachPop = Long.parseLong(eachCA.get("POPULATION")); if (eachPop > pop) { theCA = eachCA; pop = eachPop; } } return new Document[] { theUS, theCA }; } if (ttz.endsWith("(Canada)")) { ArrayList<Document> ca = ResourceFactory.getClbIndex().getDocumentsByPhrase("Canada"); Document theCA = null; long pop = -1; for (Document eachCA : ca) { long eachPop = Long.parseLong(eachCA.get("POPULATION")); if (eachPop > pop) { theCA = eachCA; pop = eachPop; } } return new Document[] { theCA }; } else { ArrayList<Document> locs = ResourceFactory.getClbIndex().getDocumentsByPhrase(ttz); Document theloc = null; if (locs == null) return null; long pop = -1; for (Document loc : locs) { long eachpop = Long.parseLong(loc.get("POPULATION")); if (eachpop > pop) { theloc = loc; pop = eachpop; } } return new Document[] { theloc }; } } public static double getDocStringOverlap(CandidateAndFeature aFeature, String userInfo) { // TODO Auto-generated method stub ArrayList<String> dTokens = new ArrayList<String>(); String name = aFeature.getAsciiName(); // System.out.println(doc); //add alternative names into it. String id = aFeature.getId(); String[] altnames = ResourceFactory.getClbIndex().getAlternateNames(id); String country = "", countryCode = ""; countryCode = aFeature.getCountryCode(); if (ResourceFactory.getCountryCode2CountryMap().isInMap(countryCode)) country = ResourceFactory.getCountryCode2CountryMap().getValue(countryCode).getAsciiName(); List<String> userTokens = EuroLangTwokenizer.tokenize(userInfo.toLowerCase()); String[] temp = new String[]{country, countryCode}; for (String t : temp){ if (t==null ||t.length()==0)continue; dTokens.add(t.toLowerCase()); } for (String t : altnames){ if (t==null ||t.length()==0)continue; dTokens.add(t.toLowerCase()); } return StringUtil.getGramSimilarity(userTokens.toArray(new String[]{}), dTokens.toArray(new String[]{})); } }