import edu.cmu.minorthird.text.*;
import edu.cmu.minorthird.text.mixup.MixupProgram;
import montylingua.JMontyLingua;
import java.io.*;
import java.net.ServerSocket;
import java.net.Socket;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class MinorTagger {
// Part-of-speech tagger instance; assigned once in main() and not read anywhere else in this file.
private static JMontyLingua montyLingua;
// Mixup program files evaluated (names first, then dates) by tag() and label().
// Both are relative paths, so they are resolved against the process working directory.
private static final File NAMEMIXUP = new File("names.mixup");
private static final File DATEMIXUP = new File("date.mixup");
// Not referenced anywhere in this file.
private static final boolean DEBUG = false;
// Listening port used by main() when no numeric first argument is supplied.
private static final int DEFAULT_PORT = 9998;
// Path of the scratch file written by createFile(); replaced via setFileName() when a
// client sends a "***<name>" line.
// NOTE(review): this is one static value shared by every connection thread, so concurrent
// clients look able to overwrite each other's name and scratch file - confirm and fix.
private static String fileName = "TaggedFile";
/**
 * Replaces the path of the scratch file that createFile() writes.
 *
 * @param name new scratch file path; stored exactly as given, without validation
 */
protected static void setFileName (String name) {
    MinorTagger.fileName = name;
}
/**
 * Starts the tagging server: picks the listening port, loads the part-of-speech
 * tagger, then accepts connections forever, handing each one to its own
 * MinorTaggerThread.
 *
 * @param args optional; args[0] is the listening port when it consists only of
 *             digits and lies in 0-65535, otherwise DEFAULT_PORT is used
 */
public static void main (String args[]) {
    ServerSocket echoServer = null;
    int port = DEFAULT_PORT;
    System.out.println("Starting MinorTagger v0.01b...");
    if (args.length > 0 && args[0].matches("\\d+")) {
        try {
            port = Integer.parseInt(args[0]);
        } catch (NumberFormatException nfe) {
            // All digits, but too many of them to fit in an int.
            port = -1;
        }
        // ServerSocket rejects ports outside 0-65535 with an IllegalArgumentException,
        // so fall back to the default instead of crashing at bind time.
        if (port < 0 || port > 65535) {
            System.out.println("WARN: Port " + args[0] + " is out of range (0-65535), using default port!");
            port = DEFAULT_PORT;
        }
    } else {
        System.out.println("WARN: No listening port specified, using default port!");
        System.out.println("WARN: To specify, use the port number as the first argument.");
    }
    System.out.println("Loading Part-of-Speech Tagger...");
    montyLingua = new JMontyLingua();
    System.out.println("MinorTagger Started Successfully!");
    System.out.println("Waiting for connection on port " + port + "...");
    try {
        echoServer = new ServerSocket(port);
        while (true) {
            Socket clientSocket = echoServer.accept();
            System.out.println("[" + (new Date()) + "] Connected from " + clientSocket.getRemoteSocketAddress());
            (new MinorTaggerThread(clientSocket)).start();
        }
    } catch (IOException e) {
        System.out.println(e);
    } finally {
        // Release the listening port when the accept loop ends abnormally.
        if (echoServer != null) {
            try {
                echoServer.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing the listener fails during shutdown.
            }
        }
    }
}
/**
 * Outputs document marked up with sgml.
 *
 * The text is written to the scratch file, loaded into a text base, run through
 * the name and date mixup programs, rendered as inline markup, and finally
 * reduced to the tags S, NP, Name, extracted_date and extracted_time.
 *
 * @param in text to tag
 * @return the marked-up text, or "" when the input is empty or whitespace only
 * @throws Exception propagated from loading the text base or evaluating the mixup programs
 */
private static String tag (String in) throws Exception {
    if (in.replaceAll("\\s+", "").length() == 0) return "";
    // load text base
    TextBaseLoader baseLoader = new TextBaseLoader();
    File tempFile = createFile(in);
    TextBase base;
    try {
        base = baseLoader.load(tempFile);
    } finally {
        // Remove the scratch file once it has been read, as label() does; only its
        // name is needed below, as the document id.
        tempFile.delete();
    }
    // get XML labels
    MutableTextLabels labels = baseLoader.getLabels();
    // evaluate mixup
    MixupProgram p = new MixupProgram(NAMEMIXUP);
    p.eval(labels, base);
    p = new MixupProgram(DATEMIXUP);
    p.eval(labels, base);
    TextLabelsLoader labelsLoader = new TextLabelsLoader();
    // Minorthird's version of marking up XML labels
    String tagged = labelsLoader.createXMLmarkup(tempFile.getName(), labels);
    String keepXML[] = {"S", "NP", "Name", "extracted_date", "extracted_time"};
    tagged = filterXML(tagged, keepXML);
    // Move whitespace that sits just before an end-tag to just after it, so that the
    // end-tag doesn't stick to the next begin-tag.
    tagged = tagged.replaceAll("(<[^<>]+>[^<>]+?)(\\s+)(</[^<>]+>)", "$1$3$2");
    return tagged;
}
/**
 * Outputs minorthird stand off labels.
 *
 * @param in text to label
 * @return the labels printed as type operations, or "" when the input is empty
 *         or whitespace only
 * @throws Exception propagated from loading the text base or evaluating the mixup programs
 */
private static String label (String in) throws Exception {
    boolean blank = in.replaceAll("\\s+", "").length() == 0;
    if (blank) {
        return "";
    }

    // Round-trip the text through the scratch file to build the text base,
    // then discard the file straight away.
    TextBaseLoader loader = new TextBaseLoader();
    File scratch = createFile(in);
    TextBase textBase = loader.load(scratch);
    scratch.delete();

    MutableTextLabels textLabels = loader.getLabels();

    // Names first, then dates.
    File[] mixupFiles = { NAMEMIXUP, DATEMIXUP };
    for (int idx = 0; idx < mixupFiles.length; idx++) {
        MixupProgram program = new MixupProgram(mixupFiles[idx]);
        program.eval(textLabels, textBase);
    }

    return new TextLabelsLoader().printTypesAsOps(textLabels);
}
/**
 * Strips every tag whose name is not in keepXML, leaving the text between tags
 * untouched.
 *
 * A tag is anything matching {@code </?([^<>]+)>}; the captured group (everything
 * between the optional slash and the closing bracket) is compared verbatim with
 * the entries of keepXML. The text is processed in a single pass, so tag names
 * never have to be embedded in a regular expression and names containing regex
 * metacharacters (including a literal backslash-E) are handled safely.
 *
 * @param tagged  marked-up text
 * @param keepXML tag names to keep; open and close tags with these names survive
 * @return the text with all other tags removed
 */
private static String filterXML (String tagged, String[] keepXML) {
    Matcher m = Pattern.compile("</?([^<>]+)>").matcher(tagged);
    StringBuffer out = new StringBuffer(tagged.length());
    int copiedUpTo = 0;
    while (m.find()) {
        // Copy the plain text that precedes this tag.
        out.append(tagged.substring(copiedUpTo, m.start()));
        if (isKeptTag(m.group(1), keepXML)) {
            out.append(m.group());
        }
        copiedUpTo = m.end();
    }
    out.append(tagged.substring(copiedUpTo));
    return out.toString();
}

/* Returns true when name equals one of the entries of keepXML. */
private static boolean isKeptTag (String name, String[] keepXML) {
    for (int i = 0; i < keepXML.length; i++) {
        if (name.equals(keepXML[i])) {
            return true;
        }
    }
    return false;
}
/**
 * Creates the scratch file (at the path currently held in fileName) from a string.
 *
 * The file is registered for deletion at JVM exit. I/O failures are reported on
 * stderr rather than thrown, so the returned file may be missing or incomplete
 * when an error was printed.
 *
 * @param content text to write, using the platform default encoding of FileWriter
 * @return the scratch file
 */
private static File createFile (String content) {
    File temp = new File(fileName);
    //temp = File.createTempFile(fileName, "");
    temp.deleteOnExit();
    BufferedWriter bWriter = null;
    try {
        bWriter = new BufferedWriter(new FileWriter(temp));
        bWriter.write(content);
    } catch (IOException ioe) {
        System.err.println("Error creating temp file: " + ioe);
    } finally {
        // Always release the file handle, even when write() failed.
        if (bWriter != null) {
            try {
                bWriter.close();
            } catch (IOException ioe) {
                System.err.println("Error closing temp file: " + ioe);
            }
        }
    }
    return temp;
}
/**
 * Serves a single client connection.
 *
 * The protocol is line oriented:
 *   "labels"          - answer with stand-off labels instead of sgml for the
 *                       rest of this connection
 *   "***name"         - use name (reduced to its last path element) as the
 *                       scratch file name
 *   line ending "$$"  - last line of a document: process the buffered text,
 *                       send the result, keep the connection open
 *   line ending "$$$" - as "$$", then close the connection
 *   any other line    - appended to the document being buffered
 *
 * NOTE(review): the scratch file name is a static of the enclosing class, so
 * connections that run at the same time share it - confirm whether concurrent
 * clients are expected.
 */
private static class MinorTaggerThread extends Thread {
    private Socket socket = null;

    public MinorTaggerThread (Socket socket) {
        super("MinorTaggerThread");
        this.socket = socket;
    }

    /**
     * Reads lines from the client until end of stream or a "$$$" terminator,
     * replying once per completed document. Any exception ends the connection
     * and is printed to stderr; the socket is closed in every case.
     */
    public void run () {
        StringBuffer buf = new StringBuffer();
        BufferedReader br;
        PrintStream os;
        String line;
        boolean label = false; //determines whether to output labels or sgml
        try {
            br = new BufferedReader(new InputStreamReader(socket.getInputStream()));
            os = new PrintStream(socket.getOutputStream());
            while ((line = br.readLine()) != null) {
                boolean end_close = false;
                boolean end_continue = false;
                if (line.equals("labels")) {
                    label = true;
                } else if (line.startsWith("***")) {
                    String requested = line.substring(3);
                    String safeName = baseName(requested);
                    if (safeName != null) {
                        setFileName(safeName);
                    } else {
                        System.err.println("Ignoring unusable file name: " + requested);
                    }
                } else if (line.endsWith("$$$")) {
                    end_close = true;
                    line = line.substring(0, line.length() - 3);
                    buf.append(line + "\n");
                } else if (line.endsWith("$$")) {
                    end_continue = true;
                    line = line.substring(0, line.length() - 2);
                    buf.append(line + "\n");
                } else {
                    // Ordinary content line: keep it so multi-line documents are
                    // tagged in full rather than only their final line.
                    buf.append(line + "\n");
                }
                if (end_close || end_continue) {
                    if (label)
                        os.println(label(buf.toString()));
                    else os.println(tag(buf.toString()));
                    buf.setLength(0);
                }
                if (end_close) {
                    os.close();
                    br.close();
                    socket.close();
                    break;
                }
            }
            System.out.println("[" + (new Date()) + "] Disconnected from " + socket.getRemoteSocketAddress());
        } catch (Exception e) {
            System.err.println(e);
        } finally {
            // Also covers client EOF without "$$$" and failures while tagging;
            // closing an already closed socket has no effect.
            try {
                socket.close();
            } catch (IOException ioe) {
                System.err.println(ioe);
            }
        }
    }

    /*
     * Reduces a client-supplied name to its last path element so a client cannot
     * make the server write or delete files outside its working directory.
     * Returns null when nothing usable remains.
     */
    private static String baseName (String requested) {
        String name = new File(requested).getName();
        if (name.length() == 0 || name.equals(".") || name.equals("..")) {
            return null;
        }
        return name;
    }
}
}