package edu.cmu.geolocator.nlp.ner.FeatureExtractor;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import edu.cmu.geolocator.common.StringUtil;
import edu.cmu.geolocator.model.Sentence;
import edu.cmu.geolocator.nlp.StanfordCoreTools.StanfordNLP;
import edu.cmu.geolocator.nlp.tokenizer.EuroLangTwokenizer;
import edu.cmu.geolocator.resource.dictionary.Dictionary;
import edu.cmu.geolocator.resource.trie.IndexSupportedTrie;
import edu.cmu.minorthird.classify.Feature;
import Wordnet.*;
/**
 * NER feature generator for location extraction: builds per-token feature
 * vectors from gazetteer lists, Brown clusters, POS tags, NP chunks,
 * dependency parses and WordNet lookups.
 */
public class AnotherFeatureGenerator extends FeatureGenerator {
// Preposition / country gazetteers (only populated via the public setters).
HashSet<String> preposition, countries;
// word -> Brown-cluster bit path; loaded once by ReadBrownCluster().
static HashMap<String, String> clusters;
// NOTE(review): never assigned in this file — presumably set elsewhere.
Dictionary prepdict, countrydict;
// Shared Stanford pipeline (tokenizer, POS tagger, lemmatizer, parser).
static StanfordNLP snlp;
IndexSupportedTrie trie;
// Gazetteer lists read (trimmed + lowercased) from res/lists/*.txt by readAllLists().
static ArrayList<String> naturalFeaturesList;
static public ArrayList<String> unnamedLocationsList;
static ArrayList<String> personNamesList;
static ArrayList<String> sportsTeamsList;
static ArrayList<String> namedOrganizationsList;
static ArrayList<String> namedOrgIndicatorList;
static ArrayList<String> spatialVerbsList;
static ArrayList<String> spatialRelationsList;
static ArrayList<String> spatialPrepsList;
static ArrayList<String> streetsuffixList;
static ArrayList<String> newsPaperList;
static ArrayList<String> numbersList;
// Toponym/country names, kept as a set for O(1) membership tests.
static HashSet<String> toponymsList;
// Sentence currently being processed; read by genChunkFeatures().
// NOTE(review): only assigned in main() — null on other entry paths; verify.
private static String sen;
/**
 * Loads every gazetteer list used by the feature generators. Invoked lazily
 * from the constructor the first time an instance is built.
 *
 * @throws IOException if any list file is missing or unreadable
 */
public static void readAllLists() throws IOException {
namedOrganizationsList = readListFile("LNamedOrganization");
unnamedLocationsList = readListFile("LUnnamedLocation");
namedOrgIndicatorList = readListFile("LNamedOrgIndicator");
spatialVerbsList = readListFile("LSpatialVerbs");
spatialRelationsList = readListFile("LSpatialRelations");
personNamesList = readListFile("LPersonNames");
spatialPrepsList = readListFile("LSpatialPreps");
streetsuffixList = readListFile("LStreetSuffix");
sportsTeamsList = readListFile("LSportsTeams");
newsPaperList = readListFile("LNewsPapers");
numbersList = readListFile("LNumbers");
naturalFeaturesList = readListFile("LNaturalFeatures");
// Stored as a HashSet (readSetFile) for fast membership tests.
// NOTE(review): readSetFile reads from "res/Lists/" while readListFile
// reads from "res/lists/" — on a case-sensitive filesystem only one of
// the two directory spellings can be correct; verify.
toponymsList = readSetFile("LAllCountries"); // LAllCountries");
/*
 * System.out.println(LNaturalFeatures.get(2));
 * System.out.println(LUnnamedLocations.get(2));
 * System.out.println(LNamedOrganizations.get(2));
 */
}
/**
 * Partial-membership test: true when at least one list entry contains the
 * given word as a substring.
 *
 * @param list gazetteer entries to scan
 * @param word token to look for
 * @return true if any entry contains the word
 */
public static boolean containsPartial(ArrayList<String> list, String word) {
    for (int idx = 0; idx < list.size(); idx++) {
        if (list.get(idx).contains(word)) {
            return true;
        }
    }
    return false;
}
/**
 * Reads res/lists/&lt;FileName&gt;.txt into a list, one trimmed, lowercased
 * entry per line.
 * NOTE(review): readSetFile uses "res/Lists/" (capital L); on case-sensitive
 * filesystems only one spelling can be right — confirm before unifying.
 *
 * @param FileName base name of the list file (without directory or .txt)
 * @return the lowercased entries in file order
 * @throws IOException if the file cannot be read
 */
public static ArrayList<String> readListFile(String FileName)
        throws IOException {
    ArrayList<String> list = new ArrayList<String>();
    String filename = "res/lists/" + FileName + ".txt";
    System.err.println("Reading file:" + filename);
    BufferedReader reader = new BufferedReader(new FileReader(filename));
    try {
        String line;
        while ((line = reader.readLine()) != null) {
            // Lower casing before adding to list
            list.add(line.trim().toLowerCase());
        }
    } finally {
        // Close even when readLine throws (the original leaked the reader).
        reader.close();
    }
    return list;
}
/**
 * Loads the Brown-cluster paths file (tab-separated: "bitpath\tword...")
 * into the static {@code clusters} map as word -> bit path.
 *
 * @param filename path to the Brown cluster paths file
 * @throws IOException if the file cannot be read
 */
public static void ReadBrownCluster(String filename) throws IOException {
    clusters = new HashMap<String, String>();
    BufferedReader reader = new BufferedReader(new FileReader(filename));
    try {
        String line;
        while ((line = reader.readLine()) != null) {
            // Split once per line (the original split each line twice).
            String[] parts = line.split("\t");
            if (parts.length < 2)
                continue; // skip malformed lines instead of crashing
            clusters.put(parts[1], parts[0]);
        }
    } finally {
        // Close even when readLine throws (the original leaked the reader).
        reader.close();
    }
    System.out.println("BC DONE");
}
/**
 * Reads res/Lists/&lt;FileName&gt;.txt into a set, one trimmed, lowercased
 * entry per line (used for the toponym gazetteer, where O(1) lookup matters).
 * NOTE(review): readListFile uses "res/lists/" (lower-case l) — confirm the
 * correct directory spelling before unifying.
 *
 * @param FileName base name of the list file (without directory or .txt)
 * @return set of lowercased entries
 * @throws IOException if the file cannot be read
 */
public static HashSet<String> readSetFile(String FileName)
        throws IOException {
    HashSet<String> set = new HashSet<String>();
    String filename = "res/Lists/" + FileName + ".txt";
    System.err.println("Reading file:" + filename);
    BufferedReader reader = new BufferedReader(new FileReader(filename));
    try {
        String line;
        while ((line = reader.readLine()) != null) {
            // Lower casing before adding to list
            set.add(line.trim().toLowerCase());
        }
    } finally {
        // Close even when readLine throws (the original leaked the reader).
        reader.close();
    }
    return set;
}
/**
 * Builds the generator: creates the shared Stanford pipeline and lazily
 * loads the gazetteer lists and Brown clusters on the first construction
 * (the loaded data is static, so later instances reuse it).
 */
public AnotherFeatureGenerator() {
super();
// initialize dictionary to lookup.
// "geoNames.com/allCountries.txt"
snlp = new StanfordNLP();
// Stanford - Lemmatizer, tokenizer, NER, POS
// Lists are static: load them only once, on the first instance.
if (unnamedLocationsList == null)
try {
readAllLists();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
if (clusters == null)
try {
ReadBrownCluster("res/brownclusters/paths");
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
// Diagnostic counters; never read or written in this file.
// NOTE(review): presumably updated/inspected by other classes — verify
// before removing.
static int statstreet = 0;
static int statbuilding = 0;
static int stattoponym = 0;
static int statabbr = 0, statadj = 0;
// Raw tweet text; unused in this file.
String tweet;
/**
 * Tokenizes a (possibly tag-annotated) sentence, derives BIO labels from
 * the inline markup, extracts per-token features and returns one
 * "token feature... label" line per token (CRF training format).
 *
 * @param sentence raw sentence, possibly containing &lt;Tag&gt;...&lt;/Tag&gt; markup
 * @param fgen generator used for feature extraction
 * @return one feature line per token, or null when tokenization yields nothing
 * @throws IOException if feature extraction fails
 */
public static ArrayList<String> FeaturelistGen(String sentence,
        AnotherFeatureGenerator fgen) throws IOException {
    snlp.Tokenizer(sentence);
    String[] simpleTokenizedData = snlp.StringTokenizer(sentence);
    // BUG FIX: the original tested "!= null || length != 0", which
    // dereferenced a null array; both conditions must hold.
    if (simpleTokenizedData == null || simpleTokenizedData.length == 0)
        return null;
    ArrayList<String> featureList = new ArrayList<String>();
    // Strip inline tag tokens and compute a BIO label per kept token.
    ArrayList<String> newTokens = new ArrayList<String>();
    String[] tokenTags = TokentoBIOTag(simpleTokenizedData, newTokens);
    String[] tokenizedData = newTokens.toArray(new String[newTokens.size()]);
    List<Feature[]> tokenFeatures = fgen.extractFeature(tokenizedData);
    // Write token + features + label for each token.
    for (int j = 0; j < tokenFeatures.size(); j++) {
        StringBuffer bw = new StringBuffer();
        initialFeatureWriter();
        bw.append(tokenizedData[j] + " ");
        for (Feature f : tokenFeatures.get(j)) {
            append(f.toString());
            bw.append(f.toString() + " ");
        }
        bw.append(" ");
        // location class (BIO label)
        String loctag = tokenTags[j];
        bw.append(loctag + " ");
        bw.append("\n");
        featureList.add(bw.toString());
    }
    return featureList;
}
/**
 * Demo entry point: runs the full feature pipeline on a fixed sentence and
 * prints one feature line per token.
 */
public static void main(String argv[]) throws IOException,
        InterruptedException {
    AnotherFeatureGenerator fgen = new AnotherFeatureGenerator();
    // genChunkFeatures reads the static 'sen', so it must be set first.
    sen = "cross the United States";
    ArrayList<String> featureList = FeaturelistGen(sen, fgen);
    // BUG FIX: FeaturelistGen returns null when tokenization yields
    // nothing; the original iterated without checking and could NPE.
    if (featureList == null) {
        System.err.println("No features generated for: " + sen);
        return;
    }
    for (String fn : featureList) {
        System.out.println(fn + " ");
    }
}
/**
 * FeatureGenerator entry point: tokenizes the sentence with the Stanford
 * tokenizer and delegates to {@link #extractFeature(String[])}.
 * Returns null when tokenization/extraction throws IOException.
 */
@Override
public List<Feature[]> extractFeature(Sentence tweetSentence) {
try {
return extractFeature(snlp.StringTokenizer(tweetSentence
.getSentenceString()));
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return null;
}
/**
 * MAIN FUNCTION FOR EXTRACTING FEATURES. Runs the Stanford pipeline (POS,
 * lemma, dependency parse), NP-chunks the tokens, then generates the full
 * feature set for every token.
 *
 * @param tokens tokenized sentence
 * @return one Feature[] per token, in token order
 * @throws IOException if a feature generator fails
 */
public List<Feature[]> extractFeature(String[] tokens) throws IOException {
    int length = tokens.length;
    // NOTE(review): the arrays are sized length + 1 (one unused trailing
    // slot) — genPosFeatures sees that extra slot as a real position and
    // can emit "+1.pos.null" for the last token; kept as-is because the
    // trained model may depend on those feature strings.
    String[] posTags = new String[length + 1];
    String[] lemma = new String[length + 1];
    // Dependency-parse context: token -> incoming edge label, and
    // token -> POS tags of its dependents.
    Map<String, String> parentEdge = new HashMap<String, String>();
    Map<String, ArrayList<String>> childrenEdge = new HashMap<String, ArrayList<String>>();
    snlp.DoAll(tokens, posTags, lemma, parentEdge, childrenEdge);
    ArrayList<String> npChunks = snlp.NPChunker(tokens, posTags);
    List<List<Feature>> instances = new ArrayList<List<Feature>>(
            tokens.length);
    for (int i = 0; i < tokens.length; i++) {
        // Fresh feature list for this token.
        List<Feature> f = new ArrayList<Feature>();
        genLemmaFeatures(f, tokens, lemma, posTags, i); // lemma & lower case
        genBrownClusterFeatures(f, tokens, i);
        // morphological feature
        genCapFeatures(f, tokens, i);
        // semantic features
        genPosFeatures(f, posTags, i);
        genChunkFeatures(f, tokens, posTags, npChunks, i);
        // list lookup features
        genLookupListFeatures(f, tokens, i);
        genParseFeatures(f, tokens, parentEdge, childrenEdge, i);
        genWordnetFeatures(f, tokens, i);
        instances.add(f);
    }
    // Convert to the output format expected by callers.
    ArrayList<Feature[]> newinstances = new ArrayList<Feature[]>();
    for (int i1 = 0; i1 < instances.size(); i1++) {
        newinstances.add(instances.get(i1).toArray(new Feature[] {}));
    }
    return newinstances;
}
/**
 * NP-chunk features for token i. The first NP chunk containing the token
 * contributes 22 binary features: surface properties of the chunk (last
 * word on the unnamed-location / org-indicator / street-suffix lists,
 * all-caps, person-name, numerals, newspaper/sports "non-location"), plus
 * "chunk is a location-ish phrase AND is preceded within N words by a
 * spatial verb (N=4) / preposition (N=3) / relation (N=3)". Tokens outside
 * every chunk get the same 22 features with value 0.
 *
 * Bug fixes relative to the original:
 *  - last2Words was built before the length check (ArrayIndexOutOfBounds
 *    on single-word chunks);
 *  - sen.substring(0, s - 1) crashed when 'sen' was null or the chunk was
 *    at position 0 / not found;
 *  - the always-true guard "!(x == null) || !(x.length == 0)" ("&&" was
 *    intended) is gone;
 *  - featval was not reset before the unnamedLocation spatialprep /
 *    spatialrelation blocks, leaking the previous feature's value;
 *  - the toponym/street/naturalFeature/building *spatialrelation* blocks
 *    looked up spatialPrepsList instead of spatialRelationsList;
 *  - the no-chunk fallback emitted "0_toponym_"/"0_naturalFeature_" names
 *    that did not match the in-chunk "0_toponyms_"/"0_naturalFeatures_"
 *    names, so the classifier saw them as unrelated features.
 */
private void genChunkFeatures(List<Feature> f, String[] tokens,
        String[] posTags, ArrayList<String> npChunks, int i) {
    boolean foundChunk = false;
    for (String np : npChunks) {
        // Only the first chunk containing this token contributes features.
        if (!np.contains(tokens[i]))
            continue;
        foundChunk = true;
        String[] chunkWords = np.split(" ");
        int n = chunkWords.length;
        String lastWord = chunkWords[n - 1];
        // Empty when the chunk has a single word (see bug list above).
        String last2Words = (n >= 2) ? chunkWords[n - 2] + " " + lastWord : "";

        // 1) last word OR last two words on the unnamed-location list
        boolean unnamedLoc = unnamedLocationsList.contains(lastWord.toLowerCase())
                || (n >= 2 && unnamedLocationsList.contains(last2Words.toLowerCase()));
        addFeature(f, "0_lword_Unloc_" + b2i(unnamedLoc));

        // 2) every word of the chunk starts with a capital letter
        int caps = 0;
        for (String cw : chunkWords)
            if (MPHCAPbool(cw))
                caps++;
        addFeature(f, "0_lword_FirstCap_" + b2i(caps == n));

        // 3) last word on the named-organization-indicator list
        addFeature(f, "0_lword_NamedOrgIndicator_"
                + b2i(namedOrgIndicatorList.contains(lastWord)));

        // 4) last word on the street-suffix list
        addFeature(f, "0_lword_StreetSuffix_"
                + b2i(streetsuffixList.contains(lastWord)));

        // 5) chunk on the newspaper/sports lists => NOT a location (inverted: 0)
        addFeature(f, "0_lword_NonLoc_"
                + b2i(!(newsPaperList.contains(np) || sportsTeamsList.contains(np))));

        // 6) every word is a capitalized personal name => NOT a location (0)
        int persons = 0;
        for (String cw : chunkWords)
            if (personNamesList.contains(cw) && MPHCAPbool(cw))
                persons++;
        addFeature(f, "0_lword_Person_" + b2i(persons != n));

        // 7) chunk contains a digit or a number word
        boolean hasNumeral = false;
        for (String cw : chunkWords)
            if (numbersList.contains(cw) || cw.matches(".*\\d.*"))
                hasNumeral = true;
        addFeature(f, "0_lword_Numerals_" + b2i(hasNumeral));

        // Words preceding the chunk in the current sentence (static 'sen');
        // empty when 'sen' is unavailable or the chunk starts the sentence.
        String[] pre = precedingWords(np);

        boolean topo = toponymsList.contains(np);
        boolean streetLast = streetsuffixList.contains(lastWord);
        // NOTE(review): the prep/relation variants below tested the WHOLE
        // chunk against the street-suffix list (the verb variant tested
        // only the last word); both behaviors are preserved as-is.
        boolean streetNp = streetsuffixList.contains(np);
        boolean natural = naturalFeaturesList.contains(np);
        boolean building = namedOrganizationsList.contains(np);

        // 8-12) location-ish chunk preceded within 4 words by a <spatial verb>
        addFeature(f, "0_unnamedLocation_spatialverbs_"
                + b2i(unnamedLoc && precededBy(spatialVerbsList, pre, 4)));
        addFeature(f, "0_toponyms_spatialverbs_"
                + b2i(topo && precededBy(spatialVerbsList, pre, 4)));
        addFeature(f, "0_street_spatialverbs_"
                + b2i(streetLast && precededBy(spatialVerbsList, pre, 4)));
        addFeature(f, "0_naturalFeatures_spatialverbs_"
                + b2i(natural && precededBy(spatialVerbsList, pre, 4)));
        addFeature(f, "0_building_spatialverbs_"
                + b2i(building && precededBy(spatialVerbsList, pre, 4)));

        // 13-17) preceded within 3 words by a <spatial preposition>
        addFeature(f, "0_lword_unnamedLocation_spatialprep_"
                + b2i(unnamedLoc && precededBy(spatialPrepsList, pre, 3)));
        addFeature(f, "0_lword_toponym_spatialprep_"
                + b2i(topo && precededBy(spatialPrepsList, pre, 3)));
        addFeature(f, "0_lword_street_spatialprep_"
                + b2i(streetNp && precededBy(spatialPrepsList, pre, 3)));
        addFeature(f, "0_lword_naturalFeature_spatialprep_"
                + b2i(natural && precededBy(spatialPrepsList, pre, 3)));
        addFeature(f, "0_lword_building_spatialprep_"
                + b2i(building && precededBy(spatialPrepsList, pre, 3)));

        // 18-22) preceded within 3 words by a <spatial relation>
        addFeature(f, "0_lword_unnamedLocation_spatialrelation_"
                + b2i(unnamedLoc && precededBy(spatialRelationsList, pre, 3)));
        addFeature(f, "0_lword_toponym_spatialrelation_"
                + b2i(topo && precededBy(spatialRelationsList, pre, 3)));
        addFeature(f, "0_lword_street_spatialrelation_"
                + b2i(streetNp && precededBy(spatialRelationsList, pre, 3)));
        addFeature(f, "0_lword_naturalFeature_spatialrelation_"
                + b2i(natural && precededBy(spatialRelationsList, pre, 3)));
        addFeature(f, "0_lword_building_spatialrelation_"
                + b2i(building && precededBy(spatialRelationsList, pre, 3)));
        break; // only the first matching chunk is used
    }
    if (!foundChunk) {
        // Token outside every NP chunk: emit the full feature block as 0
        // so every token carries the same feature set.
        addFeature(f, "0_lword_Unloc_0");
        addFeature(f, "0_lword_FirstCap_0");
        addFeature(f, "0_lword_NamedOrgIndicator_0");
        addFeature(f, "0_lword_StreetSuffix_0");
        addFeature(f, "0_lword_NonLoc_0");
        addFeature(f, "0_lword_Person_0");
        addFeature(f, "0_lword_Numerals_0");
        addFeature(f, "0_unnamedLocation_spatialverbs_0");
        addFeature(f, "0_toponyms_spatialverbs_0");
        addFeature(f, "0_street_spatialverbs_0");
        addFeature(f, "0_naturalFeatures_spatialverbs_0");
        addFeature(f, "0_building_spatialverbs_0");
        addFeature(f, "0_lword_unnamedLocation_spatialprep_0");
        addFeature(f, "0_lword_toponym_spatialprep_0");
        addFeature(f, "0_lword_street_spatialprep_0");
        addFeature(f, "0_lword_naturalFeature_spatialprep_0");
        addFeature(f, "0_lword_building_spatialprep_0");
        addFeature(f, "0_lword_unnamedLocation_spatialrelation_0");
        addFeature(f, "0_lword_toponym_spatialrelation_0");
        addFeature(f, "0_lword_street_spatialrelation_0");
        addFeature(f, "0_lword_naturalFeature_spatialrelation_0");
        addFeature(f, "0_lword_building_spatialrelation_0");
    }
}

/** 1 for true, 0 for false — compact binary feature values. */
private static int b2i(boolean b) {
    return b ? 1 : 0;
}

/**
 * Words of the current sentence that precede chunk np; empty when the
 * sentence is unavailable or the chunk is at (or before) the start.
 */
private static String[] precedingWords(String np) {
    if (sen == null)
        return new String[0];
    int s = sen.indexOf(np);
    if (s <= 0)
        return new String[0];
    return sen.substring(0, s - 1).split(" ");
}

/**
 * True when any of the last 'window' words before the chunk appears on the
 * given indicator list.
 */
private static boolean precededBy(ArrayList<String> list, String[] pre,
        int window) {
    if (pre == null)
        return false;
    for (int k = 1; k <= window && pre.length - k >= 0; k++)
        if (list.contains(pre[pre.length - k]))
            return true;
    return false;
}
/*
 * Parse feature. Label each token with its incoming edge
 */
private static void genParseFeatures(List<Feature> f, String[] t_tweet,
Map<String, String> parentEdge,
Map<String, ArrayList<String>> childrenEdge, int i) {
// Incoming dependency edge label of the token ("null" when absent).
addFeature(f, "0_cont_Pedge_" + parentEdge.get(t_tweet[i]));
// If incoming link is *subj* / *obj*
if (parentEdge.get(t_tweet[i]) != null
&& parentEdge.get(t_tweet[i]).matches(
"(.*)subj(.*)|(.*)obj(.*)"))
addFeature(f, "0_cont_subORobj_" + true);
else
addFeature(f, "0_cont_subORobj_" + false);
// Any of the parent's link is subj/obj
// This needs to be coded in the stanford NLP since we dont have access
// to tree here
// as of now.
// Token, or one of the 3 preceding tokens, hangs off a *prep* edge.
// NOTE(review): the guards use "i - 1 > 0" etc., so the token at index 0
// is never inspected as a preceding token — looks like an off-by-one
// (">= 0" was probably intended); left as-is because the trained model
// may depend on the current feature values.
if ((parentEdge.get(t_tweet[i]) != null && parentEdge.get(t_tweet[i])
.matches("(.*)prep(.*)"))
|| (i - 1 > 0 && parentEdge.get(t_tweet[i - 1]) != null && parentEdge
.get(t_tweet[i - 1]).matches("(.*)prep(.*)"))
|| (i - 2 > 0 && parentEdge.get(t_tweet[i - 2]) != null && parentEdge
.get(t_tweet[i - 2]).matches("(.*)prep(.*)"))
|| (i - 3 > 0 && parentEdge.get(t_tweet[i - 3]) != null && parentEdge
.get(t_tweet[i - 3]).matches("(.*)prep(.*)")))
addFeature(f, "0_cont_prep_" + true);
else
addFeature(f, "0_cont_prep_" + false);
// POS tags of the token's dependents, rendered without spaces.
if (childrenEdge.containsKey(t_tweet[i]))
addFeature(
f,
"0_childrenPOS_"
+ Arrays.toString(
childrenEdge.get(t_tweet[i]).toArray())
.replace(" ", ""));
else
addFeature(f, "0_childrenPOS_" + "None");
}
/*
 * Wordnet features: fires "0_wordnet_true" when any WordNet category of the
 * token matches a location-indicating keyword, "0_wordnet_false" otherwise.
 */
private static void genWordnetFeatures(List<Feature> f, String[] tokens,
        int i) {
    // Location-indicating keywords (the original rebuilt this list
    // element-by-element on every call).
    List<String> keywords = Arrays.asList("structure", "building", "room",
            "factory", "office", "institution", "location", "place",
            "position", "area", "region");
    Set<String> wordnet = WordnetApi.WordnetFeature(tokens[i]);
    String res = "false";
    for (String w : keywords) {
        if (wordnet.contains(w)) {
            res = "true";
            break; // one hit is enough
        }
    }
    addFeature(f, "0_wordnet_" + res);
}
// //////////////////////////////////////////////
/**
 * In the List OR NOT: gazetteer membership features for the normalized
 * (deaccented, lowercased) token — exact presence on four lists plus
 * partial presence (some list entry contains the token) on three.
 *
 * @param f feature list to append to
 * @param t_data raw tokens
 * @param i index of the current token
 */
private static void genLookupListFeatures(List<Feature> f, String[] t_data,
        int i) {
    // Normalize once instead of recomputing TOKLW for every lookup.
    String lw = TOKLW(t_data[i]);
    addFeature(f, "Presence_LUnnamedLocation_"
            + unnamedLocationsList.contains(lw));
    addFeature(f, "Presence_LPersonNames_"
            + personNamesList.contains(lw));
    addFeature(f, "Presence_LNamedOrganization_"
            + namedOrganizationsList.contains(lw));
    addFeature(f, "Presence_LToponym_" + toponymsList.contains(lw));
    // Partial presence (a part of the location word contains token)
    addFeature(f, "Presence_LUnnamedLocationPartial_"
            + containsPartial(unnamedLocationsList, lw));
    addFeature(f, "Presence_LPersonNamesPartial_"
            + containsPartial(personNamesList, lw));
    addFeature(f, "Presence_LNamedOrganizationPartial_"
            + containsPartial(namedOrganizationsList, lw));
}
/**
 * Brown-cluster feature: the cluster bit path of the normalized token, or
 * "-1" when the token is out of vocabulary.
 */
private static void genBrownClusterFeatures(List<Feature> f,
        String[] t_data, int i) throws IOException {
    // Single map lookup (the original did containsKey + get).
    String cluster = clusters.get(TOKLW(t_data[i]));
    if (cluster != null)
        addFeature(f, "BrownCluster_" + cluster);
    else
        addFeature(f, "BrownCluster_-1");
}
// lemma, lower, POS feature
// Emits the token's lemma and its normalized (deaccented, lowercased) form.
private static void genLemmaFeatures(List<Feature> f, String[] t_data,
String[] lemma, String[] POS, int i) {
// System.out.println(t_data[i]);
addFeature(f, "lemma_" + lemma[i]);
/*
 * if((i-1)>0 && (i-2)>0 && (i-3)>0)
 * addFeature(f,"POS_"+POS[i-1]+POS[i-2]+ POS[i-3]);
 */
// Deaccented, lowercased token (see TOKLW); POS parameter is unused.
addFeature(f, "lower_" + TOKLW(t_data[i]));
// System.out.println("Presence_LUnnamedLocation_" +
// unnamedLocationsList.contains(TOKLW(t_data[i])));
}
/**
 * POINT POS FOR EACH SURROUNDING WORD + POS SEQUENCE.
 * Emits the POS tag of the current token, of every position in the
 * [-4, +4] window ("false" when outside the sentence), and the left/right
 * POS-tag concatenations as sequence features.
 *
 * @param f feature list to append to
 * @param f_pos POS tags (may contain a trailing unused slot)
 * @param i index of the current token
 */
private static void genPosFeatures(List<Feature> f, String[] f_pos, int i) {
    int n = f_pos.length;
    // Current word.
    addFeature(f, "0.pos." + f_pos[i]);
    StringBuilder leftSeq = new StringBuilder();
    StringBuilder rightSeq = new StringBuilder();
    // Left window: positions i-4 .. i-1.
    for (int off = -4; off <= -1; off++) {
        int p = i + off;
        if (p >= 0) {
            addFeature(f, off + ".pos." + f_pos[p]);
            leftSeq.append(f_pos[p]);
        } else {
            addFeature(f, off + ".pos." + "false");
        }
    }
    // Right window: positions i+1 .. i+4.
    for (int off = 1; off <= 4; off++) {
        int p = i + off;
        if (p <= n - 1) {
            addFeature(f, "+" + off + ".pos." + f_pos[p]);
            rightSeq.append(f_pos[p]);
        } else {
            addFeature(f, "+" + off + ".pos." + "false");
        }
    }
    addFeature(f, "-pos_seq_" + leftSeq);
    addFeature(f, "+pos_seq_" + rightSeq);
}
/**
 * CAPITALIZATION SEQUENCE POINT CAPs OF SURROUNDING WORDS CAP SEQUENCEs
 *
 * Emits the capitalization of the current token, of positions -3..+3
 * ("false" outside the sentence), and left/right capitalization-sequence
 * features.
 * NOTE(review): the window is deliberately asymmetric — the +-4 positions
 * are commented out, -3/+3 are excluded from the sequence strings, and
 * only the -1 position appends a "::" separator. This looks like tuned
 * feature engineering; do not "clean up" without re-validating the model.
 *
 * @param f
 * @param t_tweet
 * @param i
 */
// cap.seq-3-1.seq+1+3
private static void genCapFeatures(List<Feature> f, String[] t_tweet, int i) {
int t_length = t_tweet.length;
// CURRENT WORD
addFeature(f, "0_mph_cap_" + MPHCAP(t_tweet[i]));
String left = "", right = "";
if (i - 4 >= 0) {
// addFeature(f, "-4_mph_cap_" + MPHCAP(t_tweet[i - 4]));
// left += MPHCAP(t_tweet[i - 4]);
}
if (i - 3 >= 0) {
addFeature(f, "-3_mph_cap_" + MPHCAP(t_tweet[i - 3]));
// left += MPHCAP(t_tweet[i - 3]);
} else
addFeature(f, "-3_mph_cap_" + "false");
if (i - 2 >= 0) {
addFeature(f, "-2_mph_cap_" + MPHCAP(t_tweet[i - 2]));
left += MPHCAP(t_tweet[i - 2]);
} else
addFeature(f, "-2_mph_cap_" + "false");
if (i - 1 >= 0) {
addFeature(f, "-1_mph_cap_" + MPHCAP(t_tweet[i - 1]));
left += MPHCAP(t_tweet[i - 1]) + "::";
} else
addFeature(f, "-1_mph_cap_" + "false");
if (i + 1 <= t_length - 1) {
addFeature(f, "+1_mph_cap_" + MPHCAP(t_tweet[i + 1]));
right += MPHCAP(t_tweet[i + 1]);
} else
addFeature(f, "+1_mph_cap_" + "false");
if (i + 2 <= t_length - 1) {
addFeature(f, "+2_mph_cap_" + MPHCAP(t_tweet[i + 2]));
right += MPHCAP(t_tweet[i + 2]);
} else
addFeature(f, "+2_mph_cap_" + "false");
if (i + 3 <= t_length - 1) {
addFeature(f, "+3_mph_cap_" + MPHCAP(t_tweet[i + 3]));
// right += MPHCAP(t_tweet[i + 3]);
} else
addFeature(f, "+3_mph_cap_" + "false");
if (i + 4 <= t_length - 1) {
// addFeature(f, "+4_mph_cap_" + MPHCAP(t_tweet[i + 4]));
// right += MPHCAP(t_tweet[i + 4]);
}
addFeature(f, "-_mph_cap_seq_" + left);
addFeature(f, "+_mph_cap_seq_" + right);
addFeature(f, "-+_mph_cap_seq_" + left + right);
}
/*
 * NOTE(review): the javadoc about "CONTEXT WORD (LEMMA) EXISTENCE" that
 * previously sat here described a bag-of-context-words feature generator
 * that no longer exists in this file.
 */
/**
 * CAPITALIZATION: "true"/"false" depending on whether the token's first
 * character is upper case.
 *
 * @param string token (may be empty)
 * @return boolean as a string
 */
private static String MPHCAP(String string) {
    // Guard: the original called charAt(0) unconditionally and threw
    // StringIndexOutOfBoundsException on an empty token.
    boolean capped = !string.isEmpty()
            && Character.isUpperCase(string.charAt(0));
    return Boolean.toString(capped);
}
/** Boolean form of MPHCAP: first character is upper case (false for ""). */
private static Boolean MPHCAPbool(String string) {
    // Guard: the original called charAt(0) unconditionally and threw
    // StringIndexOutOfBoundsException on an empty token.
    return !string.isEmpty() && Character.isUpperCase(string.charAt(0));
}
/**
 * CONVERT TO LOWER TYPE Input the lemma, 1. Run tokentype() to convert to
 * token 2. lowercase and deaccent the lemma.
 *
 * @param lemmastring raw token or lemma
 * @return normalized form: tokentype() placeholder ([http]/[num]/bare tag),
 *         then deaccented and lowercased by StringUtil
 */
private static String TOKLW(String lemmastring) {
lemmastring = StringUtil
.getDeAccentLoweredString(tokentype(lemmastring));
return lemmastring;
}
// ///////////////////////////////////////////////////////////////////////////////////////////////////////
// TOOLS
// //////////////////////////////////
/**
 * JUDGE EMPTY OF AN ARRAY: true when the token array is null, zero-length,
 * or a single empty string.
 *
 * @param array token array (may be null)
 * @return true if there is no usable token
 */
static boolean EmptyArray(String[] array) {
    // BUG FIX: the original indexed array[0] without a length check and
    // threw ArrayIndexOutOfBoundsException on a zero-length array (and
    // NullPointerException on null).
    if (array == null || array.length == 0)
        return true;
    return array.length < 2 && array[0].equals("");
}
// ////////////////////////////////////////////////////////////////////////////////
// HELPER FOR FEATURE VECTOR
// /////////////////////////////////////////
// Line buffer shared by initialFeatureWriter()/append()/emit().
// NOTE(review): static mutable state — not safe for concurrent use.
static StringBuilder sb = new StringBuilder();
/**
 * helper for building feature vector. sb stores the features on a line, and
 * this func is used to initialize the sb, aka, clear the builder.
 */
private static void initialFeatureWriter() {
sb = new StringBuilder();
}
// Appends one feature to the current line, tab-separated.
private static void append(String featurestring) {
if (sb.length() > 0)
sb.append("\t");
sb.append(featurestring);
}
// Returns the accumulated feature line, newline-terminated.
// NOTE(review): also mutates sb by appending the newline.
static String emit() {
return sb.append("\n").toString();
}
// Wraps a feature string into a minorthird Feature and adds it to the list.
private static void addFeature(List<Feature> features, String string) {
features.add(new Feature(string));
}
// ////////////////////////////////////////////////////////////////////////////////////
// GETTER AND SETTERS /////
// Plain accessors for the preposition/country gazetteers and the trie
// (none of which are read by the feature generators in this file).
public HashSet<String> getPreposition() {
return preposition;
}
public void setPreposition(HashSet<String> preposition) {
this.preposition = preposition;
}
public HashSet<String> getCountries() {
return countries;
}
public void setCountries(HashSet<String> countries) {
this.countries = countries;
}
public IndexSupportedTrie getTrie() {
return trie;
}
public void setTrie(IndexSupportedTrie trie) {
this.trie = trie;
}
/**
 * Strips inline &lt;Tag&gt;...&lt;/Tag&gt; markup from a line and records a
 * BIO label for every word inside a tagged span: the first word maps to
 * "B-&lt;tag&gt;", subsequent words to "I-&lt;tag&gt;".
 *
 * @param line annotated input line
 * @param tagdata out-parameter: word -> BIO label for tagged words
 * @return the line with all markup removed
 */
public static String ParseFineLine(String line,
        HashMap<String, String> tagdata) {
    // Plain text with every <...> element removed.
    String stripped = line.replaceAll("\\<.*?>", "");
    Matcher tagged = Pattern.compile("<.*?>(.*?)</.*?>").matcher(line);
    while (tagged.find()) {
        // Tag name comes from the opening element, e.g. "Toponym".
        String tag = tagged.group(0).split(">")[0].replace("<", "");
        String[] words = tagged.group(1).split(" ");
        int pos = 0;
        for (String w : words) {
            if (w.equals("Telefonica"))
                System.err.println(tag + "," + tagged.group(1)); // leftover debug trace
            tagdata.put(w, (pos == 0 ? "B-" : "I-") + tag);
            pos++;
        }
    }
    return stripped;
}
/*
 * Assumes tokens contain inline tags as their own tokens, eg:
 * ["I","am","in","<Toponym>","New","York","</Toponym>","."].
 * Real tokens are appended to newTokens; the returned array holds one tag
 * ("O" or the bare tag name) per accepted token, in newTokens order.
 * Trailing slots (one per stripped tag token) remain null.
 */
public static String[] TokentoTag(String[] Tokens,
        ArrayList<String> newTokens) {
    String[] Tags = new String[Tokens.length];
    String startReg = "<.*?>";
    String endReg = "</.*?>";
    String curr_tag = "O";
    int i = 0;
    for (String w : Tokens) {
        if (w.matches(startReg) && !w.matches(endReg)) {
            // Opening tag: label the following tokens with its name.
            curr_tag = w.replace("<", "").replace(">", "");
            continue;
        }
        if (w.matches(endReg)) {
            curr_tag = "O";
            continue;
        }
        if (w.equals("<") || w.equals(">") || w.equals("\\")
                || w.equals("<\\")) {
            System.err.println(w); // stray markup fragment, kept as a token
        }
        newTokens.add(w);
        Tags[i] = curr_tag;
        i++;
    }
    // BUG FIX: the original asserted equality, which fails (under -ea)
    // whenever the input actually contains tag tokens.
    assert (newTokens.size() <= Tokens.length);
    return Tags;
}
/**
 * Maps each token to its tag via the Tagdata lookup, defaulting to "O".
 *
 * @param Tokens tokens to tag
 * @param Tagdata token -> tag map (raw type kept for caller compatibility)
 * @return one tag per token, same order as Tokens
 */
public static String[] TokentoTag(String[] Tokens, HashMap Tagdata) {
    String[] Tags = new String[Tokens.length];
    int i = 0; // primitive counter (the original boxed it as Integer)
    for (String token : Tokens) {
        Object tag = Tagdata.get(token);
        Tags[i] = (tag == null) ? "O" : (String) tag;
        i++;
    }
    return Tags;
}
/**
 * Like TokentoTag but emits BIO labels: the first token inside a tagged
 * span gets "B-&lt;tag&gt;", subsequent ones "I-&lt;tag&gt;", untagged
 * tokens "O". Real tokens are appended to newTokens; trailing slots of the
 * returned array (one per stripped tag token) remain null.
 */
public static String[] TokentoBIOTag(String[] Tokens,
        ArrayList<String> newTokens) {
    String[] Tags = new String[Tokens.length];
    String startReg = "<.*?>";
    String endReg = "</.*?>";
    String curr_tag = "O";
    int start = 0; // tokens seen so far inside the current tagged span
    int i = 0;
    for (String w : Tokens) {
        if (w.matches(startReg) && !w.matches(endReg)) {
            // Opening tag: label the following tokens with its name.
            curr_tag = w.replace("<", "").replace(">", "");
            continue;
        }
        if (w.matches(endReg)) {
            curr_tag = "O";
            start = 0;
            continue;
        }
        if (w.equals("<") || w.equals(">") || w.equals("\\")
                || w.equals("<\\")) {
            System.err.println(w); // stray markup fragment, kept as a token
        }
        newTokens.add(w);
        if (curr_tag.equals("O"))
            start = 0;
        else {
            if (start == 0)
                curr_tag = "B-" + curr_tag; // first token of the span
            else if (start == 1)
                curr_tag = curr_tag.replace("B-", "I-"); // later tokens
            start++;
        }
        Tags[i] = curr_tag;
        i++;
    }
    // BUG FIX: the original asserted equality, which fails (under -ea)
    // whenever the input actually contains tag tokens.
    assert (newTokens.size() <= Tokens.length);
    return Tags;
}
/**
 * CONVERT TO TYPE Naively decide the tweet token type, url, or hashtag, or
 * metion, or number. Or it's not any of them, just return it's original
 * string.
 *
 * @param token raw token
 * @return "[http]" for URLs, "[num]" for numbers, the bare word for
 *         @mentions/#hashtags, otherwise the deaccented lowercased token
 */
public static String tokentype(String token) {
// lower cased word.
String ltoken = StringUtil.getDeAccentLoweredString(token.trim());
// NOTE(review): "www:" looks like a typo for "www." — confirm before fixing,
// as changing it alters the emitted features.
if (ltoken.startsWith("http:") || ltoken.startsWith("www:")) {
ltoken = "[http]";
} else if (ltoken.startsWith("@") || ltoken.startsWith("#")) {
// Strip the @mention / #hashtag sigil, keeping the bare word.
if (ltoken.length() > 1) {
ltoken = ltoken.substring(1);
}
}
// Collapse any parseable number to a single [num] placeholder.
try {
Double.parseDouble(ltoken);
ltoken = "[num]";
} catch (NumberFormatException e) {
}
return ltoken;
}
}