package edu.cmu.geolocator.nlp.ner.FeatureExtractor;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import edu.cmu.geolocator.common.StringUtil;
import edu.cmu.geolocator.model.Sentence;
import edu.cmu.geolocator.nlp.StanfordCoreTools.StanfordNLP;
import edu.cmu.geolocator.nlp.tokenizer.EuroLangTwokenizer;
import edu.cmu.geolocator.resource.dictionary.Dictionary;
import edu.cmu.geolocator.resource.trie.IndexSupportedTrie;
import edu.cmu.minorthird.classify.Feature;
import Wordnet.*;
public class FineFeatureGenerator extends FeatureGenerator{
// Per-instance word sets; within this file they are only touched by their getters and setters.
HashSet<String> preposition, countries;
// Brown-cluster lookup (word -> cluster id) filled by ReadBrownCluster() and read by
// genBrownClusterFeatures(). Static, so it is shared by every instance.
static HashMap<String,String> clusters ;
// Not referenced anywhere else in this file.
Dictionary prepdict, countrydict;
// Shared NLP wrapper used for tokenizing, tagging and chunking; the constructor
// re-creates it on every instantiation.
static StanfordNLP snlp;
// Only exposed through getTrie()/setTrie() in this file.
IndexSupportedTrie trie;
// Word lists populated by readAllLists(). Entries are trimmed and lower-cased on load.
// NOTE(review): naturalFeaturesList is never assigned in this file (its only use is in
// commented-out code), so it stays null unless set from outside.
static ArrayList<String> naturalFeaturesList ;
static public ArrayList<String> unnamedLocationsList ;
static ArrayList<String> personNamesList ;
static ArrayList<String> sportsTeamsList ;
static ArrayList<String> namedOrganizationsList ;
static ArrayList<String> namedOrgIndicatorList ;
static ArrayList<String> spatialVerbsList ;
static ArrayList<String> spatialRelationsList ;
static ArrayList<String> spatialPrepsList ;
static ArrayList<String> streetsuffixList ;
static ArrayList<String> newsPaperList ;
static ArrayList<String> numbersList ;
// Loaded from the "LAllCountries" file by readAllLists() via readSetFile().
static HashSet<String> toponymsList ;
/**
 * Loads every static word list used by the feature generators.
 *
 * The files are read one after another, in the order below; the first file that
 * cannot be read aborts the whole load and leaves the remaining lists unassigned.
 *
 * @throws IOException if any of the list files cannot be opened or read
 */
public static void readAllLists() throws IOException{
    // Organisation and location vocabularies.
    namedOrganizationsList = readListFile("LNamedOrganization");
    unnamedLocationsList   = readListFile("LUnnamedLocation");
    namedOrgIndicatorList  = readListFile("LNamedOrgIndicator");

    // Spatial language cues and person names.
    spatialVerbsList       = readListFile("LSpatialVerbs");
    spatialRelationsList   = readListFile("LSpatialRelations");
    personNamesList        = readListFile("LPersonNames");
    spatialPrepsList       = readListFile("LSpatialPreps");

    // Street suffixes, non-location names and number words.
    streetsuffixList       = readListFile("LStreetSuffix");
    sportsTeamsList        = readListFile("LSportsTeams");
    newsPaperList          = readListFile("LNewsPapers");
    numbersList            = readListFile("LNumbers");

    // The toponym list is kept as a set rather than a list.
    toponymsList           = readSetFile("LAllCountries");
}
/**
 * Tells whether {@code word} occurs as a substring of at least one entry of {@code list}.
 *
 * @param list entries to search in
 * @param word text to look for inside each entry
 * @return {@code true} as soon as one entry contains {@code word}, {@code false} otherwise
 */
public static boolean containsPartial(ArrayList<String> list, String word){
    final int total = list.size();
    for (int idx = 0; idx < total; idx++) {
        final String entry = list.get(idx);
        if (entry.contains(word)) {
            return true;
        }
    }
    return false;
}
/**
 * Reads {@code res/lists/<FileName>.txt} into a list, one entry per line.
 * Every line is trimmed and lower-cased before it is stored; blank lines are kept
 * as empty entries.
 *
 * @param FileName base name of the list file, without directory or extension
 * @return the entries in file order
 * @throws IOException if the file cannot be opened or read
 */
public static ArrayList<String> readListFile(String FileName) throws IOException{
    ArrayList<String> list = new ArrayList<String>();
    String filename = "res/lists/"+FileName+".txt";
    System.err.println("Reading file:"+filename);
    BufferedReader reader = new BufferedReader(new FileReader(filename));
    try {
        String line;
        while ((line = reader.readLine()) != null) {
            // Lower-case on load so lookups can be done against lower-cased tokens.
            list.add(line.trim().toLowerCase());
        }
    } finally {
        // Close even when readLine() fails, so the file handle is not leaked.
        reader.close();
    }
    return list;
}
/**
 * Loads the Brown-cluster table into the static {@code clusters} map.
 * Each usable line has at least two tab-separated fields: the cluster id first,
 * the word second. Lines with fewer than two fields are skipped.
 *
 * @param filename path of the cluster file
 * @throws IOException if the file cannot be opened or read
 */
public static void ReadBrownCluster(String filename) throws IOException{
    // Assigned before the file is opened, so the map is non-null (possibly empty)
    // even when reading fails.
    clusters = new HashMap<String,String>();
    BufferedReader reader = new BufferedReader(new FileReader(filename));
    try {
        String line;
        while ((line = reader.readLine()) != null) {
            String[] fields = line.split("\t");
            if (fields.length < 2) {
                // Blank or malformed line: nothing to index.
                continue;
            }
            clusters.put(fields[1], fields[0]);
        }
    } finally {
        // The reader used to be left open; always release it.
        reader.close();
    }
    System.out.println("BC DONE");
}
/**
 * Reads {@code res/Lists/<FileName>.txt} into a set, one entry per line.
 * Every line is trimmed and lower-cased before it is stored.
 *
 * @param FileName base name of the list file, without directory or extension
 * @return the distinct entries of the file
 * @throws IOException if the file cannot be opened or read
 */
public static HashSet<String> readSetFile(String FileName) throws IOException{
    HashSet<String> set = new HashSet<String>();
    // NOTE(review): this path uses "res/Lists" while readListFile uses "res/lists";
    // on a case-sensitive file system only one of them can match - confirm the directory name.
    String filename = "res/Lists/"+FileName+".txt";
    System.err.println("Reading file:"+filename);
    BufferedReader reader = new BufferedReader(new FileReader(filename));
    try {
        String line;
        while ((line = reader.readLine()) != null) {
            // Lower-case on load so lookups can be done against lower-cased tokens.
            set.add(line.trim().toLowerCase());
        }
    } finally {
        // Close even when readLine() fails, so the file handle is not leaked.
        reader.close();
    }
    return set;
}
/**
 * Creates a generator and makes sure the shared static resources are available.
 *
 * A new StanfordNLP wrapper is created on every call. The word lists and the
 * Brown clusters are only loaded when they have not been loaded before; a failure
 * to read either of them is printed and otherwise ignored.
 */
public FineFeatureGenerator() {
    super();
    snlp = new StanfordNLP();

    // unnamedLocationsList doubles as the "lists already loaded" marker.
    if (unnamedLocationsList == null) {
        try {
            readAllLists();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    if (clusters == null) {
        try {
            ReadBrownCluster("res/brownclusters/paths");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
// Counters that are declared here but neither read nor updated anywhere in this file.
static int statstreet = 0;
static int statbuilding = 0;
static int stattoponym = 0;
static int statabbr = 0, statadj = 0;
// Not referenced anywhere else in this file.
String tweet;
/**
 * Builds one printable feature line per token of {@code sentence}.
 *
 * Each line has the form {@code token feat1 feat2 ...  tag \n}, where the tag is the
 * BIO label derived from inline markup by {@link #TokentoBIOTag}.
 *
 * The original guard was {@code tokens != null || tokens.length != 0}, which threw a
 * NullPointerException for a null token array and made the fallback branch unreachable.
 * A null or empty tokenization now yields an empty list.
 *
 * @param sentence raw sentence, optionally containing inline markup tokens
 * @param fgen     generator used to extract the per-token features
 * @return one formatted line per token; empty when the sentence produces no tokens
 * @throws IOException propagated from feature extraction
 */
public static ArrayList<String> FeaturelistGen (String sentence,FineFeatureGenerator fgen) throws IOException {
    ArrayList<String> FeatureList = new ArrayList<String>();

    // The result is not used here; the call is kept in case it has side effects inside snlp.
    snlp.Tokenizer(sentence);
    String[] simpleTokenizedData = snlp.StringTokenizer(sentence);
    if (simpleTokenizedData == null || simpleTokenizedData.length == 0) {
        return FeatureList;
    }

    // Strip markup tokens and derive the BIO tag of every remaining token.
    ArrayList<String> newTokens = new ArrayList<String>();
    String[] tokenTags = TokentoBIOTag(simpleTokenizedData, newTokens);
    String[] tokenizedData = newTokens.toArray(new String[newTokens.size()]);

    List<Feature[]> tokenFeatures = fgen.extractFeature(tokenizedData);

    for (int j = 0; j < tokenFeatures.size(); j++) {
        StringBuilder row = new StringBuilder();
        // The shared tab-separated builder is reset and fed as before, so emit()
        // still reflects the most recent token.
        initialFeatureWriter();
        row.append(tokenizedData[j]).append(" ");
        for (Feature feature : tokenFeatures.get(j)) {
            String text = feature.toString();
            append(text);
            row.append(text).append(" ");
        }
        row.append(" ");
        row.append(tokenTags[j]).append(" ");
        row.append("\n");
        FeatureList.add(row.toString());
    }
    return FeatureList;
}
/**
 * Small manual driver: generates the feature lines for a fixed sample sentence
 * and prints each of them to standard output.
 */
public static void main(String argv[]) throws IOException, InterruptedException {
    final FineFeatureGenerator generator = new FineFeatureGenerator();
    final String sample = "cross the United States";
    final ArrayList<String> rows = FeaturelistGen(sample, generator);
    for (int k = 0; k < rows.size(); k++) {
        System.out.println(rows.get(k) + " ");
    }
}
/**
 * Tokenizes the sentence text and extracts the features of every token.
 *
 * @param tweetSentence sentence whose text is tokenized
 * @return one feature array per token, or {@code null} when an IOException occurred
 *         (the exception is printed, not rethrown)
 */
@Override
public List<Feature[]> extractFeature(Sentence tweetSentence) {
    List<Feature[]> result = null;
    try {
        final String text = tweetSentence.getSentenceString();
        result = extractFeature(snlp.StringTokenizer(text));
    } catch (IOException e) {
        e.printStackTrace();
    }
    return result;
}
/**
 * Main feature extraction routine: produces the feature array of every token.
 *
 * The tokens are first run through the shared NLP wrapper to obtain POS tags,
 * lemmas, dependency edges and noun-phrase chunks; then the individual feature
 * groups are appended for each token in a fixed order.
 *
 * @param tokens tokens of one sentence
 * @return one feature array per token, in token order
 * @throws IOException propagated from the NLP wrapper or a feature generator
 */
public List<Feature[]> extractFeature(String[] tokens) throws IOException {
    final int tokenCount = tokens.length;

    // Output buffers filled by the NLP wrapper (allocated one slot larger than the input).
    final String[] posTags = new String[tokenCount + 1];
    final String[] lemma = new String[tokenCount + 1];
    final Map<String,String> parentEdge = new HashMap<String,String>();
    final Map<String,ArrayList<String>> childrenEdge = new HashMap<String,ArrayList<String>>();

    snlp.DoAll(tokens,posTags,lemma,parentEdge,childrenEdge);
    final ArrayList<String> npChunks = snlp.NPChunker(tokens, posTags);

    final ArrayList<Feature[]> result = new ArrayList<Feature[]>();
    for (int pos = 0; pos < tokenCount; pos++) {
        final List<Feature> current = new ArrayList<Feature>();

        genLemmaFeatures(current, tokens, lemma, posTags, pos);      // lemma and lower-cased form
        genBrownClusterFeatures(current, tokens, pos);               // Brown cluster id
        genCapFeatures(current, tokens, pos);                        // capitalization window
        genChunkFeatures(current, tokens, posTags, npChunks, pos);   // noun-phrase chunk cues
        genLookupListFeatures(current, tokens, pos);                 // word-list membership
        genParseFeatures(current, tokens, parentEdge, childrenEdge, pos); // dependency edges
        genWordnetFeatures(current, tokens, pos);                    // WordNet cue

        result.add(current.toArray(new Feature[0]));
    }
    return result;
}
/**
 * Adds the noun-phrase chunk features of token {@code i}.
 *
 * The first chunk whose text contains the token (substring match) is used. Ten
 * {@code 0_lword_*} features are always emitted, in a fixed order; when no chunk
 * contains the token they are all emitted with value 0.
 *
 * Fixes over the previous version:
 * <ul>
 * <li>The spatial verb / preposition / relation checks called {@code list.contains(i-1)}
 *     with an integer, which can never match a list of strings; they now look at the
 *     preceding tokens themselves.</li>
 * <li>The window guard was {@code (i-k) > 0}, which never inspected the first token;
 *     it is now {@code i-k >= 0}.</li>
 * <li>The word lists are lower-cased when loaded, so chunk text is lower-cased before
 *     every lookup. Previously the person-name check required a word to be both in the
 *     lower-cased list and capitalized, which could not happen.</li>
 * </ul>
 *
 * @param f        feature list of the current token (appended to)
 * @param tokens   tokens of the sentence
 * @param posTags  POS tags (unused here, kept for the existing signature)
 * @param npChunks noun-phrase chunks of the sentence, words separated by single spaces
 * @param i        index of the current token
 */
private void genChunkFeatures(List<Feature> f, String[] tokens, String[] posTags, ArrayList<String> npChunks, int i) {
    // Locate the first chunk that mentions the current token.
    String chunk = null;
    for (String np : npChunks) {
        if (np.contains(tokens[i])) {
            chunk = np;
            break;
        }
    }

    if (chunk == null) {
        addFeature(f, "0_lword_Unloc_" + 0);
        addFeature(f, "0_lword_FirstCap_" + 0);
        addFeature(f, "0_lword_NamedOrgIndicator_" + 0);
        addFeature(f, "0_lword_StreetSuffix_" + 0);
        addFeature(f, "0_lword_NonLoc_" + 0);
        addFeature(f, "0_lword_Person_" + 0);
        addFeature(f, "0_lword_Numerals_" + 0);
        addFeature(f, "0_lword_spatialverbs_" + 0);
        addFeature(f, "0_lword_spatialprep_" + 0);
        addFeature(f, "0_lword_spatialrelations_" + 0);
        return;
    }

    String[] chunkWords = chunk.split(" ");
    String chunkLower = chunk.toLowerCase();
    String lastWord = chunk.substring(chunk.lastIndexOf(' ') + 1).toLowerCase();

    // Last word, or last two words, on the unnamed-location list.
    int featval = unnamedLocationsList.contains(lastWord) ? 1 : 0;
    if (chunkWords.length >= 2) {
        String lastTwoWords = chunkWords[chunkWords.length - 2] + " " + chunkWords[chunkWords.length - 1];
        if (unnamedLocationsList.contains(lastTwoWords.toLowerCase())) {
            featval = 1;
        }
    }
    addFeature(f, "0_lword_Unloc_" + featval);

    // Every word of the chunk starts with an upper-case letter.
    int capitalized = 0;
    for (String cw : chunkWords) {
        if (MPHCAPbool(cw)) {
            capitalized++;
        }
    }
    addFeature(f, "0_lword_FirstCap_" + (capitalized == chunkWords.length ? 1 : 0));

    // Last word is an organization indicator / a street suffix.
    addFeature(f, "0_lword_NamedOrgIndicator_" + (namedOrgIndicatorList.contains(lastWord) ? 1 : 0));
    addFeature(f, "0_lword_StreetSuffix_" + (streetsuffixList.contains(lastWord) ? 1 : 0));

    // Value 0 marks a chunk that is a known newspaper or sports team; 1 otherwise.
    boolean knownNonLocation = newsPaperList.contains(chunkLower) || sportsTeamsList.contains(chunkLower);
    addFeature(f, "0_lword_NonLoc_" + (knownNonLocation ? 0 : 1));

    // Value 0 marks a chunk in which every word is a capitalized person name; 1 otherwise.
    int personWords = 0;
    for (String cw : chunkWords) {
        if (personNamesList.contains(cw.toLowerCase()) && MPHCAPbool(cw)) {
            personWords++;
        }
    }
    addFeature(f, "0_lword_Person_" + (personWords == chunkWords.length ? 0 : 1));

    // Any word is a number word or contains a digit.
    featval = 0;
    for (String cw : chunkWords) {
        if (numbersList.contains(cw.toLowerCase()) || cw.matches(".*\\d.*")) {
            featval = 1;
        }
    }
    addFeature(f, "0_lword_Numerals_" + featval);

    // Spatial cue words among the tokens that precede the current one.
    addFeature(f, "0_lword_spatialverbs_" + (precededByListWord(spatialVerbsList, tokens, i, 5) ? 1 : 0));
    addFeature(f, "0_lword_spatialprep_" + (precededByListWord(spatialPrepsList, tokens, i, 3) ? 1 : 0));
    addFeature(f, "0_lword_spatialrelations_" + (precededByListWord(spatialRelationsList, tokens, i, 5) ? 1 : 0));
}

/** Returns true when one of the {@code window} tokens before index {@code i} is on {@code words}. */
private static boolean precededByListWord(List<String> words, String[] tokens, int i, int window) {
    for (int back = 1; back <= window && i - back >= 0; back++) {
        if (words.contains(tokens[i - back].toLowerCase())) {
            return true;
        }
    }
    return false;
}
/**
 * Adds the dependency-parse features of token {@code i}:
 * <ul>
 * <li>{@code 0_cont_Pedge_}: label of the incoming edge ("null" when unknown);</li>
 * <li>{@code 0_cont_subORobj_}: whether that label contains "subj" or "obj";</li>
 * <li>{@code 0_cont_prep_}: whether the token or one of the three tokens before it
 *     has an incoming edge whose label contains "prep";</li>
 * <li>{@code 0_childrenPOS_}: the recorded children entry of the token, or "None".</li>
 * </ul>
 *
 * The previous window guard was {@code i-k > 0}, so the first token of the sentence
 * was never considered as a predecessor; the guard is now {@code i-k >= 0}.
 *
 * @param f            feature list of the current token (appended to)
 * @param t_tweet      tokens of the sentence
 * @param parentEdge   token text -> incoming edge label
 * @param childrenEdge token text -> children entries
 * @param i            index of the current token
 */
private static void genParseFeatures(List<Feature> f, String[] t_tweet, Map<String,String> parentEdge,Map<String,ArrayList<String>> childrenEdge ,int i) {
    String incoming = parentEdge.get(t_tweet[i]);
    addFeature(f, "0_cont_Pedge_" + incoming);

    boolean subjectOrObject = incoming != null && incoming.matches("(.*)subj(.*)|(.*)obj(.*)");
    addFeature(f, "0_cont_subORobj_" + subjectOrObject);

    // Look at the current token and up to three predecessors, stopping at the sentence start.
    boolean prepNearby = false;
    for (int back = 0; back <= 3 && i - back >= 0 && !prepNearby; back++) {
        String edge = parentEdge.get(t_tweet[i - back]);
        prepNearby = edge != null && edge.matches("(.*)prep(.*)");
    }
    addFeature(f, "0_cont_prep_" + prepNearby);

    if (childrenEdge.containsKey(t_tweet[i])) {
        addFeature(f, "0_childrenPOS_" + Arrays.toString(childrenEdge.get(t_tweet[i]).toArray()).replace(" ", ""));
    } else {
        addFeature(f, "0_childrenPOS_" + "None");
    }
}
/**
 * Adds the {@code 0_wordnet_} feature of token {@code i}: "true" when the set
 * returned by WordnetApi.WordnetFeature for the token contains at least one of a
 * fixed list of place/structure words, "false" otherwise.
 *
 * @param f      feature list of the current token (appended to)
 * @param tokens tokens of the sentence
 * @param i      index of the current token
 */
private static void genWordnetFeatures(List<Feature> f, String[] tokens,int i) {
    final List<String> placeWords = Arrays.asList(
            "structure", "building", "room", "factory", "office", "institution",
            "location", "place", "position", "area", "region");

    final Set<String> related = WordnetApi.WordnetFeature(tokens[i]);

    boolean found = false;
    for (int k = 0; k < placeWords.size() && !found; k++) {
        found = related.contains(placeWords.get(k));
    }
    addFeature(f, "0_wordnet_" + found);
}
// //////////////////////////////////////////////
/**
 * Word-list membership features of token {@code i}.
 *
 * The token is normalized with TOKLW and then checked against the word lists: four
 * exact-membership features followed by three "partial" features that are true when
 * some list entry contains the normalized token as a substring.
 *
 * @param f      feature list of the current token (appended to)
 * @param t_data tokens of the sentence
 * @param i      index of the current token
 */
private static void genLookupListFeatures(List<Feature> f, String[] t_data, int i ) {
    final String key = TOKLW(t_data[i]);

    // Exact membership.
    addFeature(f, "Presence_LUnnamedLocation_" + unnamedLocationsList.contains(key));
    addFeature(f, "Presence_LPersonNames_" + personNamesList.contains(key));
    addFeature(f, "Presence_LNamedOrganization_" + namedOrganizationsList.contains(key));
    addFeature(f, "Presence_LToponym_" + toponymsList.contains(key));

    // The token is part of a (possibly longer) list entry.
    addFeature(f, "Presence_LUnnamedLocationPartial_" + containsPartial(unnamedLocationsList, key));
    addFeature(f, "Presence_LPersonNamesPartial_" + containsPartial(personNamesList, key));
    addFeature(f, "Presence_LNamedOrganizationPartial_" + containsPartial(namedOrganizationsList, key));
}
/**
 * Adds the Brown-cluster feature of token {@code i}: {@code BrownCluster_<id>} when
 * the normalized token has a cluster id, {@code BrownCluster_-1} otherwise.
 *
 * @param f      feature list of the current token (appended to)
 * @param t_data tokens of the sentence
 * @param i      index of the current token
 */
private static void genBrownClusterFeatures(List<Feature> f, String[] t_data, int i ) throws IOException {
    final String clusterId = clusters.get(TOKLW(t_data[i]));
    final String feature = (clusterId == null) ? "BrownCluster_-1" : "BrownCluster_" + clusterId;
    addFeature(f, feature);
}
/**
 * Adds the lemma feature and the normalized lower-case form feature of token {@code i}.
 *
 * @param f      feature list of the current token (appended to)
 * @param t_data tokens of the sentence
 * @param lemma  lemma of every token
 * @param POS    POS tags (currently unused)
 * @param i      index of the current token
 */
private static void genLemmaFeatures(List<Feature> f, String[] t_data, String[] lemma , String[] POS,int i ) {
    final String lemmaFeature = "lemma_" + lemma[i];
    addFeature(f, lemmaFeature);

    final String lowerFeature = "lower_" + TOKLW(t_data[i]);
    addFeature(f, lowerFeature);
}
/**
* COUNTRY GAZ EXISTENCE
*
* @param f
* @param f_country
* @param i
*/
/**
 * Part-of-speech window features of token {@code i}.
 *
 * Emits the tag of the current token, then the tag at each offset -4..-1 and +1..+4
 * (the literal "false" where the offset falls outside the array), and finally the
 * concatenated tag sequences to the left and to the right.
 *
 * @param f     feature list of the current token (appended to)
 * @param f_pos POS tag of every token
 * @param i     index of the current token
 */
private static void genPosFeatures(List<Feature> f, String[] f_pos, int i) {
    final int lastIndex = f_pos.length - 1;

    addFeature(f, "0.pos." + f_pos[i]);

    // Left context, farthest position first.
    final StringBuilder leftSeq = new StringBuilder();
    for (int dist = 4; dist >= 1; dist--) {
        final int at = i - dist;
        if (at >= 0) {
            addFeature(f, "-" + dist + ".pos." + f_pos[at]);
            leftSeq.append(f_pos[at]);
        } else {
            addFeature(f, "-" + dist + ".pos." + "false");
        }
    }

    // Right context, nearest position first.
    final StringBuilder rightSeq = new StringBuilder();
    for (int dist = 1; dist <= 4; dist++) {
        final int at = i + dist;
        if (at <= lastIndex) {
            addFeature(f, "+" + dist + ".pos." + f_pos[at]);
            rightSeq.append(f_pos[at]);
        } else {
            addFeature(f, "+" + dist + ".pos." + "false");
        }
    }

    addFeature(f, "-pos_seq_" + leftSeq);
    addFeature(f, "+pos_seq_" + rightSeq);
}
/**
 * Capitalization window features of token {@code i}.
 *
 * Emits the capitalization flag of the current token and of the tokens at offsets
 * -3..-1 and +1..+3 (the literal "false" where the offset falls outside the sentence),
 * followed by three sequence features. The left sequence is built from offsets -2 and
 * -1 (the latter followed by "::"), the right sequence from offsets +1 and +2.
 *
 * @param f       feature list of the current token (appended to)
 * @param t_tweet tokens of the sentence
 * @param i       index of the current token
 */
private static void genCapFeatures(List<Feature> f, String[] t_tweet, int i) {
    final int lastIndex = t_tweet.length - 1;

    addFeature(f, "0_mph_cap_" + MPHCAP(t_tweet[i]));

    final StringBuilder leftSeq = new StringBuilder();
    final StringBuilder rightSeq = new StringBuilder();

    for (int offset = -3; offset <= 3; offset++) {
        if (offset == 0) {
            continue;
        }
        final String prefix = (offset > 0 ? "+" : "") + offset + "_mph_cap_";
        final int at = i + offset;
        if (at < 0 || at > lastIndex) {
            addFeature(f, prefix + "false");
            continue;
        }
        final String cap = MPHCAP(t_tweet[at]);
        addFeature(f, prefix + cap);

        // Only the two nearest neighbours on each side take part in the sequences.
        switch (offset) {
            case -2:
                leftSeq.append(cap);
                break;
            case -1:
                leftSeq.append(cap).append("::");
                break;
            case 1:
            case 2:
                rightSeq.append(cap);
                break;
            default:
                break;
        }
    }

    addFeature(f, "-_mph_cap_seq_" + leftSeq);
    addFeature(f, "+_mph_cap_seq_" + rightSeq);
    addFeature(f, "-+_mph_cap_seq_" + leftSeq + rightSeq);
}
/**
* CONTEXT WORD (LEMMA) EXISTENCE The bag of words feature, and position
* appearance feature together. 1. Each lemma is added in bag of context
* words 2. Each position has an presence feature for determining the
* existence of the window position.
*
* @param f
* : Feature list
* @param lemmat_tweet
* : lemmas of the tweet,
* @param i
* : position of the current word
*/
/**
 * Capitalization flag of a token, as text.
 *
 * @param string token to inspect
 * @return "true" when the first character is upper-case; "false" otherwise, including
 *         for a null or empty token (which previously caused an exception)
 */
private static String MPHCAP(String string) {
    if (string == null || string.length() == 0) {
        return Boolean.toString(false);
    }
    return Boolean.toString(Character.isUpperCase(string.charAt(0)));
}
/**
 * Capitalization flag of a token.
 *
 * @param string token to inspect
 * @return true when the first character is upper-case; false otherwise, including for
 *         a null or empty token (which previously caused an exception)
 */
private static Boolean MPHCAPbool(String string) {
    if (string == null || string.length() == 0) {
        return Boolean.FALSE;
    }
    return Character.isUpperCase(string.charAt(0));
}
/**
 * Normalizes a token for lookups: first maps it to its token type with
 * {@link #tokentype(String)}, then passes the result through
 * StringUtil.getDeAccentLoweredString.
 *
 * @param lemmastring token to normalize
 * @return the normalized token
 */
private static String TOKLW(String lemmastring) {
    final String typed = tokentype(lemmastring);
    return StringUtil.getDeAccentLoweredString(typed);
}
// ///////////////////////////////////////////////////////////////////////////////////////////////////////
// TOOLS
// //////////////////////////////////
/**
 * Tells whether an array carries no content.
 *
 * @param array array to inspect
 * @return true for a null array, a zero-length array, or an array whose only element
 *         is the empty string; false otherwise. (A zero-length array used to raise an
 *         ArrayIndexOutOfBoundsException.)
 */
static boolean EmptyArray(String[] array) {
    if (array == null || array.length == 0) {
        return true;
    }
    return array.length == 1 && "".equals(array[0]);
}
// ////////////////////////////////////////////////////////////////////////////////
// HELPER FOR FEATURE VECTOR
// /////////////////////////////////////////
/** Shared buffer holding the tab-separated features of the line being built. */
static StringBuilder sb = new StringBuilder();

/**
 * Starts a new feature line by replacing the shared buffer with an empty one.
 */
private static void initialFeatureWriter() {
    sb = new StringBuilder();
}

/**
 * Appends one feature to the current line, preceded by a tab unless it is the first.
 */
private static void append(String featurestring) {
    final boolean first = sb.length() == 0;
    if (!first) {
        sb.append('\t');
    }
    sb.append(featurestring);
}

/**
 * Terminates the current line with a newline and returns it. The newline stays in
 * the shared buffer until the buffer is reset.
 */
static String emit() {
    sb.append('\n');
    return sb.toString();
}
/**
 * Wraps {@code string} in a new Feature and appends it to {@code features}.
 */
private static void addFeature(List<Feature> features, String string) {
    final Feature created = new Feature(string);
    features.add(created);
}
// ////////////////////////////////////////////////////////////////////////////////////
// GETTER AND SETTERS /////
/** @return the preposition set of this instance (may be null if never set) */
public HashSet<String> getPreposition() {
    return this.preposition;
}

/** @param preposition the preposition set to store on this instance */
public void setPreposition(HashSet<String> preposition) {
    this.preposition = preposition;
}

/** @return the country set of this instance (may be null if never set) */
public HashSet<String> getCountries() {
    return this.countries;
}

/** @param countries the country set to store on this instance */
public void setCountries(HashSet<String> countries) {
    this.countries = countries;
}

/** @return the trie of this instance (may be null if never set) */
public IndexSupportedTrie getTrie() {
    return this.trie;
}

/** @param trie the trie to store on this instance */
public void setTrie(IndexSupportedTrie trie) {
    this.trie = trie;
}
/**
 * Strips inline markup from a line and records the BIO tag of every marked-up word.
 *
 * For each {@code <Tag>words</Tag>} span the first word is stored as {@code B-Tag}
 * and the following words as {@code I-Tag}. Words are separated by single spaces;
 * empty fragments (caused by a leading or doubled space inside the span) are skipped,
 * so they neither receive the {@code B-} tag nor end up as an empty key in the map.
 * A leftover debug print for one hard-coded word has been removed.
 *
 * @param line    line containing inline markup
 * @param tagdata receives word -> BIO tag entries; a later span overwrites an earlier
 *                entry for the same word
 * @return the line with every {@code <...>} sequence removed
 */
public static String ParseFineLine(String line, HashMap<String, String> tagdata )
{
    String data = line.replaceAll("\\<.*?>","");
    Matcher m = Pattern.compile("<.*?>(.*?)</.*?>").matcher(line);
    while (m.find()) {
        // Tag name: text of the opening tag up to its first '>', without '<'.
        String tag = m.group(0).split(">")[0].replace("<","");
        boolean first = true;
        for (String w : m.group(1).split(" ")) {
            if (w.length() == 0) {
                continue;
            }
            tagdata.put(w, (first ? "B-" : "I-") + tag);
            first = false;
        }
    }
    return data;
}
//
//public static String[] DataTokenizer(String data)
//{
// String[] TokenizedData = data.split(" ");
//
// return TokenizedData;
//
//}
/**
 * Separates markup tokens from word tokens and labels every word with the name of
 * the enclosing tag ("O" outside any tag).
 *
 * Example input: {@code ["I","am","in","<Toponym>","New","York","</Toponym>","."]}.
 *
 * The returned array keeps the length of {@code Tokens}; only its first
 * k entries are filled, where k is the number of words appended to {@code newTokens}.
 * The former assertion {@code newTokens.size()==Tokens.length} failed for every input
 * that actually contained markup; it now checks that words and tags stay in step.
 *
 * @param Tokens    tokens including opening and closing tag tokens
 * @param newTokens receives the word tokens, in order
 * @return the tag of each appended word, index-aligned with the appended words
 */
public static String[] TokentoTag(String [] Tokens,ArrayList<String> newTokens){
    String[] Tags = new String [Tokens.length] ;
    final Pattern openTag = Pattern.compile("<.*?>");
    final Pattern closeTag = Pattern.compile("</.*?>");
    final int alreadyPresent = newTokens.size();
    String curr_tag = "O";
    int written = 0;
    for (String w : Tokens) {
        // A closing tag also matches the opening pattern, so test it first.
        if (closeTag.matcher(w).matches()) {
            curr_tag = "O";
            continue;
        }
        if (openTag.matcher(w).matches()) {
            curr_tag = w.replace("<","").replace(">", "");
            continue;
        }
        // Stray markup fragments are reported but still treated as words.
        if (w.equals("<") || w.equals(">") || w.equals("\\") || w.equals("<\\")) {
            System.err.println(w);
        }
        newTokens.add(w);
        Tags[written] = curr_tag;
        written++;
    }
    assert (newTokens.size() - alreadyPresent == written);
    return Tags;
}
/**
 * Looks up the tag of every token in {@code Tagdata}; tokens without an entry
 * (or with a null entry) are tagged "O".
 *
 * @param Tokens  tokens to tag
 * @param Tagdata token -> tag map
 * @return the tags, index-aligned with {@code Tokens}
 */
public static String[] TokentoTag(String[] Tokens, HashMap Tagdata)
{
    final String[] result = new String[Tokens.length];
    for (int pos = 0; pos < Tokens.length; pos++) {
        final Object found = Tagdata.get(Tokens[pos]);
        result[pos] = (found == null) ? "O" : (String) found;
    }
    return result;
}
/**
 * Separates markup tokens from word tokens and labels every word in BIO style:
 * {@code B-Tag} for the first word after an opening tag, {@code I-Tag} for the
 * following words of the same span, and "O" outside any span.
 *
 * The returned array keeps the length of {@code Tokens}; only its first
 * k entries are filled, where k is the number of words appended to {@code newTokens}.
 *
 * Fixes over the previous version: the assertion {@code newTokens.size()==Tokens.length}
 * failed for every input containing markup; a new opening tag that directly follows an
 * unclosed span now restarts with {@code B-}; and tag names that themselves contain
 * "B-" are no longer rewritten.
 *
 * @param Tokens    tokens including opening and closing tag tokens
 * @param newTokens receives the word tokens, in order
 * @return the BIO tag of each appended word, index-aligned with the appended words
 */
public static String[] TokentoBIOTag(String [] Tokens,ArrayList<String> newTokens){
    String[] Tags = new String [Tokens.length] ;
    final Pattern openTag = Pattern.compile("<.*?>");
    final Pattern closeTag = Pattern.compile("</.*?>");
    final int alreadyPresent = newTokens.size();
    String label = "O";          // name of the enclosing tag, "O" when outside
    boolean spanStart = true;    // true until the first word of the span is emitted
    int written = 0;
    for (String w : Tokens) {
        // A closing tag also matches the opening pattern, so test it first.
        if (closeTag.matcher(w).matches()) {
            label = "O";
            spanStart = true;
            continue;
        }
        if (openTag.matcher(w).matches()) {
            label = w.replace("<","").replace(">", "");
            spanStart = true;
            continue;
        }
        // Stray markup fragments are reported but still treated as words.
        if (w.equals("<") || w.equals(">") || w.equals("\\") || w.equals("<\\")) {
            System.err.println(w);
        }
        newTokens.add(w);
        if (label.equals("O")) {
            Tags[written] = "O";
        } else {
            Tags[written] = (spanStart ? "B-" : "I-") + label;
            spanStart = false;
        }
        written++;
    }
    assert (newTokens.size() - alreadyPresent == written);
    return Tags;
}
/**
 * Maps a raw token to a coarse token type.
 *
 * The token is trimmed and passed through StringUtil.getDeAccentLoweredString, then:
 * <ul>
 * <li>a token starting with {@code http:}, {@code https:}, {@code www.} or
 *     {@code www:} becomes {@code [http]} ({@code https:} and {@code www.} were
 *     previously not recognised);</li>
 * <li>a leading {@code @} or {@code #} is dropped when something follows it;</li>
 * <li>anything that {@link Double#parseDouble(String)} accepts becomes {@code [num]}.</li>
 * </ul>
 * Any other token is returned in its normalized form.
 *
 * @param token raw token
 * @return the token type marker or the normalized token
 */
public static String tokentype(String token) {
    String ltoken = StringUtil.getDeAccentLoweredString(token.trim());
    boolean looksLikeUrl = ltoken.startsWith("http:") || ltoken.startsWith("https:")
            || ltoken.startsWith("www.") || ltoken.startsWith("www:");
    if (looksLikeUrl) {
        ltoken = "[http]";
    } else if (ltoken.startsWith("@") || ltoken.startsWith("#")) {
        if (ltoken.length() > 1) {
            ltoken = ltoken.substring(1);
        }
    }
    try {
        Double.parseDouble(ltoken);
        ltoken = "[num]";
    } catch (NumberFormatException ignored) {
        // Not a number: keep the token as it is.
    }
    return ltoken;
}
}