/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

/**
   @author Wei Li <a href="mailto:weili@cs.umass.edu">weili@cs.umass.edu</a>
 */

package cc.mallet.share.weili.ner.enron;

import java.util.regex.*;
import java.util.*;
import java.io.*;

import cc.mallet.pipe.*;
import cc.mallet.share.weili.ner.*;
import cc.mallet.types.*;

/**
 * Pipe that turns an e-mail message file into a token sequence with a
 * parallel label sequence.
 *
 * <p>The instance data is cast to a {@link File}. The lines before the first
 * blank line are treated as the message header: name fragments are harvested
 * from <code>@enron.com</code> addresses found there and remembered in
 * {@link #headerPersonNames}. The remaining lines form the body; lines that
 * look like embedded headers are dropped. The body is then split into tokens,
 * and <code>ENAMEX</code>/<code>TIMEX</code>/<code>NUMEX</code> markup is
 * converted into <code>B-</code>/<code>I-</code>/<code>O</code> labels.
 */
public class EnronMessage2TokenSequence extends Pipe implements Serializable
{
  /** When true, the original words are also stored (one per line) as the instance source. */
  boolean saveSource = false;

  /** Body lines containing one of these markers (after only quoting characters) are dropped. */
  public static String[] skip = new String[] {"=_part_", "sent by:"};

  /**
   * Body lines containing one of these markers (after only quoting characters)
   * are dropped together with every following line up to the next blank line.
   * Entries 5..9 are also used as the header fields scanned for addresses.
   */
  public static String[] skipToBlankLine = new String[] {"subject:", "original message",
      "content-type:", "content-transfer-encoding:", "forwarded by",
      "from:", "sent:", "to:", "bcc:", "cc:"};

  /** Entity types that are turned into labels; other TYPE values leave the label unchanged. */
  public static String[] labels = new String[] {"DATE", "TIME", "LOCATION", "PERSON",
      "ORGANIZATION", "ACRONYM", "PHONE", "MONEY", "PERCENT"};

  /** Lower-case name fragments collected from header addresses across all piped messages. */
  HashSet headerPersonNames;

  /** Creates the pipe with no data alphabet and a fresh {@link LabelAlphabet} as target alphabet. */
  public EnronMessage2TokenSequence ()
  {
    super (null, new LabelAlphabet());
    headerPersonNames = new HashSet();
  }

  /**
   * Reads the file held by <code>carrier</code> and replaces the carrier's data
   * with a {@link TokenSequence} and its target with a {@link LabelSequence}.
   * If {@link #saveSource} is set, the source is set to the emitted words, one
   * per line.
   *
   * <p>An {@link IOException} while reading is reported on
   * <code>System.err</code>; whatever was read up to that point is still
   * tokenized.
   *
   * @param carrier instance whose data is a {@link File}
   * @return the same instance, with data, target and (optionally) source replaced
   */
  public Instance pipe (Instance carrier)
  {
    TokenSequence data = new TokenSequence ();
    LabelSequence target = new LabelSequence ((LabelAlphabet)getTargetAlphabet());
    StringBuffer source = saveSource ? new StringBuffer() : null;
    WordTransformation wt = new WordTransformation();

    File f = (File) carrier.getData();
    StringBuffer message = new StringBuffer();
    BufferedReader br = null;
    try {
      br = new BufferedReader(new FileReader(f));

      // Header: everything before the first blank line.
      String line = br.readLine();
      while (line != null) {
        if (line.equals("")) {
          break;
        }
        line = line.toLowerCase();
        // Entries 5..9 of skipToBlankLine are the address-bearing header fields.
        int i;
        for (i = 5; i <= 9; i++) {
          if (line.startsWith(skipToBlankLine[i])) {
            break;
          }
        }
        if (i <= 9) {
          StringBuffer header = new StringBuffer(line.substring(skipToBlankLine[i].length()));
          // Folded continuation lines start with a space or a tab.
          while ((line = br.readLine()) != null) {
            if (line.equals("")) {
              break;
            }
            if (line.startsWith(" ") || line.startsWith("\t")) {
              // Lower-case continuation lines like the first header line, so
              // the stored names can match the lower-cased lookup below.
              header.append(line.toLowerCase());
            } else {
              break;
            }
          }
          addHeaderPersonNames(header.toString());
          // "line" now holds the next unprocessed header line (or ""/null).
        } else {
          line = br.readLine();
        }
      }

      // Body: drop header-like lines, keep the rest.
      while ((line = br.readLine()) != null) {
        if (isMarkerLine(line, skip)) {
          continue;
        }
        if (isMarkerLine(line, skipToBlankLine)) {
          while ((line = br.readLine()) != null) {
            if (line.equals("")) {
              break;
            }
          }
          continue;
        }
        message.append(line);
        message.append("\n");
      }
    } catch (IOException e) {
      System.err.println(e);
    } finally {
      // Always release the file handle, also when reading failed.
      if (br != null) {
        try {
          br.close();
        } catch (IOException e) {
          System.err.println(e);
        }
      }
    }

    String currentLabel = "O";
    StringTokenizer st = new StringTokenizer(message.toString(), "<>", true);
    // Text that followed a literal "<" and still has to be processed, exactly once.
    String pending = null;
    while (pending != null || st.hasMoreTokens()) {
      String text;
      if (pending != null) {
        text = pending;
        pending = null;
      } else {
        text = st.nextToken();
      }

      if (text.equals("<")) {
        String tag = st.nextToken();
        if (tag.equals("/ENAMEX") || tag.equals("/TIMEX") || tag.equals("/NUMEX")) {
          String nextToken = st.nextToken();
          assert (nextToken.equals(">"));
          currentLabel = "O";
          continue;
        }
        else if (tag.startsWith("ENAMEX") || tag.startsWith("TIMEX") || tag.startsWith("NUMEX")) {
          String type = tag.substring(tag.indexOf(" ")+1);
          assert (type.startsWith("TYPE="));
          type = type.substring(type.indexOf("\"")+1, type.lastIndexOf("\""));
          // Nested entities are not handled: an inner tag simply overrides the label.
          for (int i = 0; i < labels.length; i++) {
            if (labels[i].equals(type)) {
              currentLabel = "B-" + type;
              break;
            }
          }
          String nextToken = st.nextToken();
          assert (nextToken.equals(">"));
          continue;
        }
        else {
          // False alarm: a literal "<" in the text. Emit it as a token and hand
          // the following text to the next iteration, so that it is tokenized
          // once and may itself start a tag.
          data.add(new Token("<"));
          target.add(currentLabel);
          if (saveSource) {
            source.append ("<");
            source.append ("\n");
          }
          pending = tag;
          continue;
        }
      }

      // There is no tag in "text": split it into words and punctuation.
      StringTokenizer wordst = new StringTokenizer(text, "~`!@#$%^&*()_-+={[}]|\\:;\"',<.>?/ \t\n\r", true);
      while (wordst.hasMoreTokens()) {
        String word = wordst.nextToken();
        if (word.equals(" ") || word.equals("\t") || word.equals("\n") || word.equals("\r")) {
          continue;
        }
        Token token = wt.transformedToken (word);

        // Flag words that occurred as a name fragment in a header address.
        if (headerPersonNames.contains(word.toLowerCase())) {
          token.setFeatureValue("HEADER-PERSON", 1.0);
        }

        data.add (token);
        target.add (currentLabel);
        if (saveSource) {
          source.append (word);
          source.append ("\n");
        }

        // Only the first token of an entity carries the B- prefix.
        if (currentLabel.startsWith("B-")) {
          currentLabel = "I-" + currentLabel.substring(2);
        }
      }
    }

    carrier.setData(data);
    carrier.setTarget(target);
    if (saveSource) {
      carrier.setSource(source);
    }
    return carrier;
  }

  /**
   * Extracts name fragments from the <code>@enron.com</code> addresses in a
   * header value and adds them to {@link #headerPersonNames}.
   *
   * <p>The header is split on spaces, tabs and commas. For a local part of the
   * form <code>first.last</code> both parts are added when they are at least
   * two characters long; for <code>x..last</code> (two adjacent dots) only the
   * trailing part is considered; any other local part with several dots, or
   * with none, is ignored.
   */
  private void addHeaderPersonNames (String header)
  {
    StringTokenizer st = new StringTokenizer(header, " \t,");
    while (st.hasMoreTokens()) {
      String token = st.nextToken();
      if (!token.endsWith("@enron.com")) {
        continue;
      }
      // Strip the 10 characters of "@enron.com".
      token = token.substring(0, token.length()-10);
      int dot = token.indexOf(".");
      if (dot == -1) {
        continue;
      }
      int lastDot = token.lastIndexOf(".");
      if (dot != lastDot) {
        if (dot == lastDot-1) {
          dot++;
          if (dot+1 < token.length()-1) {
            headerPersonNames.add(token.substring(dot+1));
          }
        }
        continue;
      }
      if (dot > 1) {
        headerPersonNames.add(token.substring(0, dot));
      }
      if (dot+1 < token.length()-1) {
        headerPersonNames.add(token.substring(dot+1));
      }
    }
  }

  /**
   * Tells whether <code>line</code> contains one of <code>markers</code>
   * (compared against the lower-cased line) preceded, after trimming, only by
   * the characters <code>'-'</code>, <code>'&gt;'</code> and <code>' '</code>.
   * Only the first occurrence of each marker is examined.
   */
  private static boolean isMarkerLine (String line, String[] markers)
  {
    String lowered = line.toLowerCase();
    for (int i = 0; i < markers.length; i++) {
      int index = lowered.indexOf(markers[i]);
      if (index == -1) {
        continue;
      }
      String prefix = line.substring(0, index).trim();
      boolean onlyQuoting = true;
      for (int j = 0; j < prefix.length(); j++) {
        char c = prefix.charAt(j);
        if (c != '-' && c != '>' && c != ' ') {
          onlyQuoting = false;
          break;
        }
      }
      if (onlyQuoting) {
        return true;
      }
    }
    return false;
  }

  /**
   * Serializes {@link #headerPersonNames} to <code>f</code>. An
   * {@link IOException} is reported on <code>System.err</code> and not
   * propagated.
   *
   * @param f file to write to
   */
  public void write (File f) {
    FileOutputStream fos = null;
    try {
      fos = new FileOutputStream(f);
      ObjectOutputStream oos = new ObjectOutputStream(fos);
      oos.writeObject(headerPersonNames);
      oos.close();
    }
    catch (IOException e) {
      System.err.println("Exception writing file " + f + ": " + e);
    }
    finally {
      // Release the file handle when writing failed before oos.close();
      // closing an already closed stream is harmless.
      if (fos != null) {
        try {
          fos.close();
        } catch (IOException ignored) {
          // Nothing more to report: any write failure was already printed above.
        }
      }
    }
  }

  // Serialization

  private static final long serialVersionUID = 1;
  private static final int CURRENT_SERIAL_VERSION = 0;

  /** Writes the serial version, {@link #saveSource} and {@link #headerPersonNames}, in that order. */
  private void writeObject (ObjectOutputStream out) throws IOException {
    out.writeInt(CURRENT_SERIAL_VERSION);
    out.writeBoolean(saveSource);
    out.writeObject(headerPersonNames);
  }

  /** Reads the fields in the order written by {@link #writeObject}; the version is read but not used. */
  private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
    int version = in.readInt();
    saveSource = in.readBoolean();
    headerPersonNames = (HashSet)in.readObject();
  }
}