"""Tokenizer and parser plugins that shell out to locally installed tools.

``XeroxTokenizer`` runs the executable configured under "Path to tokenize"
and ``XeroxParser`` runs the one configured under "Path to xfst".  Both are
registered as twisted plugins through the module-level instances at the
bottom of this file.
"""

# The import order is kept as-is on purpose: ``from qt import *`` may shadow
# names imported before it, and ``Set`` is imported after it.
import sys
import os
from subprocess import Popen, PIPE
import re
from twisted.plugin import IPlugin
from tmat.tokenizer import TokenizerPlugin
from tmat.myparser import ParserPlugin
from zope.interface import Interface, Attribute, implements
from xml.dom.minidom import getDOMImplementation, parseString
from codecs import lookup
from qt import *
from sets import Set


class XeroxTokenizer(object):
    """Tokenizer plugin backed by an external command-line tokenizer."""

    implements(IPlugin, TokenizerPlugin)

    running = False
    displayName = "Local Xerox Tokenizer"
    # Preference name -> preference type, as exposed to the host application.
    preferences = {"Path to tokenize": "string",
                   "Finite State Machine": "string",
                   "Tokenize Flags": "string",
                   "Get Version Flags": "string"}
    # NOTE: class-level dict, so stored settings are shared by all instances.
    values = {}
    pipe = None

    def setValue(self, key, val):
        """Store the preference value ``val`` under ``key``."""
        self.values[key] = val

    def do(self, command, returnPipe=False):
        """Launch ``command`` through the shell and remember the process.

        Returns the ``Popen`` object when ``returnPipe`` is true, otherwise
        whether a process object was created.  Returns -1 for an empty
        command.
        """
        if not command:
            return -1
        cmd = "%s\n" % command
        self.pipe = Popen(cmd, shell=True, bufsize=0,
                          stdin=PIPE, stdout=PIPE, stderr=PIPE)
        if returnPipe:
            return self.pipe
        return self.pipe is not None

    def initialize(self):
        """No set-up is required; the process is started per request."""
        pass

    def uninitialize(self):
        """Kill the child process if it is still running.

        Returns 0 on success (or when there is nothing to do) and -1 when
        the process could not be killed.
        """
        # poll() returns None only while the child is still alive; any other
        # value is an exit status, in which case there is nothing to kill.
        if self.pipe is not None and self.pipe.poll() is None:
            self.pipe.stdin.close()
            try:
                os.kill(self.pipe.pid, 9)
                # Reap the child so it does not linger as a zombie.
                self.pipe.wait()
            except OSError:
                return -1
        return 0

    def makeCommand(self, flags):
        """Build the shell command line from the stored paths plus ``flags``."""
        command = " %s %s" % (self.values["Path to tokenize"],
                              self.values["Finite State Machine"])
        args = [command] + flags.split(" ")
        return " ".join(args)

    def tokenize(self, text, enc="utf-8"):
        """Tokenize ``text`` and return a list of ``(start, end, isToken)``.

        ``text`` is encoded with ``enc``, sent to the external tokenizer and
        the output is decoded again; every non-blank output line is treated
        as one token and located in ``text``.  Stretches of ``text`` between
        tokens are reported with ``isToken`` set to False.
        """
        codec = lookup(enc)
        encode, decode = codec[0], codec[1]
        retText, retErr = self.communicate("Tokenize", encode(text)[0])
        retText = decode(retText)[0]

        retList = []
        i = 0
        for token in retText.splitlines():
            if not token.strip():
                continue
            j = text.find(token, i)
            if j < 0:
                # The tool emitted something that does not occur verbatim in
                # the remaining text; skipping it keeps the offsets valid and
                # leaves that stretch to be reported as a non-token span.
                continue
            if j != i:
                retList.append((i, j, False))
                i = j
            retList.append((i, i + len(token), True))
            i += len(token)
        if i != len(text):
            retList.append((i, len(text), False))
        return retList

    def communicate(self, flagName, data=None):
        """Run the tool with the flags stored under ``"<flagName> Flags"``.

        ``data``, when given, is written to the tool's stdin with newlines
        replaced by spaces.  Returns the ``(stdout, stderr)`` pair.
        """
        self.do(self.makeCommand(self.values["%s Flags" % flagName]))
        self.start()
        try:
            payload = None
            if data:
                payload = data.replace("\n", " ")
            # Always collect the output, even without input, so that calls
            # such as version() actually see what the tool printed and the
            # child process is waited for.
            retText, retErr = self.pipe.communicate(payload)
        finally:
            self.stop()
            self.uninitialize()
        return (retText, retErr)

    def start(self):
        """Mark the tokenizer as running."""
        self.running = True

    def stop(self):
        """Mark the tokenizer as not running."""
        self.running = False

    def version(self):
        """Return the tool's stdout when run with the "Get Version" flags."""
        return self.communicate("Get Version")[0]


class XeroxParser(object):
    """Parser plugin that drives the executable stored as "Path to xfst"."""

    implements(IPlugin, ParserPlugin)

    running = False
    displayName = "Local Xerox Parser"
    # Preference name -> preference type, as exposed to the host application.
    preferences = {"Finite State Machine": "string",
                   "Path to xfst": "string",
                   "Parse Flag(s)": "string",
                   "Get Version Flag(s)": "string"}
    # NOTE: class-level containers, shared by all instances until rebound.
    values = {}
    mem = {}
    encs = {}
    progressBar = None

    def setValue(self, key, val):
        """Store the preference value ``val`` under ``key``."""
        self.values[key] = val

    def initialize(self):
        """No set-up is required; always returns 0."""
        return 0

    def _initialize(self, f=""):
        """Reset the pending argument list to the executable plus flags ``f``."""
        command = self.values["Path to xfst"]
        self.args = [command] + f.split(" ")

    def setArgs(self, c):
        """Replace the pending argument list with ``c``."""
        self.args = c

    def addCommand(self, command):
        """Append ``command`` to the argument list as a quoted ``-e`` option."""
        self.args.append(" -e \"%s\"" % command.strip())

    def uninitialize(self):
        """Nothing to tear down; always returns 0."""
        return 0

    def start(self):
        """Nothing to start; always returns 0."""
        return 0

    def stop(self):
        """Nothing to stop; always returns 0."""
        return 0

    def _run(self, enc="utf-8"):
        """Execute the pending arguments and return the decoded stdout.

        The argument list is emptied afterwards, so callers have to rebuild
        or restore it before the next run.
        """
        if enc not in self.encs:
            self.encs[enc] = lookup(enc)
        codec = self.encs[enc]
        encode, decode = codec[0], codec[1]

        self.args.append(" -stop")
        # NOTE(review): the command line is assembled from preference values
        # and word forms without shell escaping and is run with shell=True.
        p = Popen(encode(" ".join(self.args))[0], shell=True, bufsize=0,
                  stdin=PIPE, stdout=PIPE, stderr=PIPE)
        self.args = []
        # communicate() drains stdout and stderr, closes the pipes and waits
        # for the child, so repeated runs do not leak processes or file
        # descriptors.
        out, err = p.communicate()
        return decode(out)[0]

    def version(self):
        """Return the stripped output of a run with the version flags."""
        # The preference is declared as "Get Version Flag(s)".
        self._initialize(self.values["Get Version Flag(s)"])
        return self._run().strip()

    def parse(self, words, progressBar=None):
        """Parse ``words`` with the configured finite state machine.

        ``progressBar`` is optional; when given it is updated after every
        word and can be used to cancel the run.  Returns the dictionary
        built by ``_parse``.
        """
        self.progressBar = progressBar
        return self._parse(self.values["Finite State Machine"], words)

    def rememberArgs(self, name):
        """Save a copy of the pending argument list under ``name``."""
        self.mem[name] = list(self.args)

    def restoreArgs(self, name):
        """Restore the argument list saved under ``name``.

        Falls back to an empty list when nothing was saved under ``name``.
        """
        if name in self.mem:
            self.setArgs(list(self.mem[name]))
        else:
            # Must be a list: addCommand() appends to it.
            self.setArgs([])

    def _rememberRegex(self, name, fsm, regex):
        """Save, under ``name``, arguments that load ``fsm`` and run ``regex``."""
        self._initialize(self.values["Parse Flag(s)"])
        self.addCommand("loadd %s" % fsm)
        self.addCommand(regex)
        self.rememberArgs(name)

    def _parse(self, fsm, wordforms):
        """Return ``{word: [{"morpheme": ..., "gloss": ...}, ...]}``.

        Each word is first applied "up" against the ``URSR`` regex; every
        non-blank output line is then applied "up" against ``GlUR``, and each
        non-blank line of that second output yields one result entry.  If the
        progress bar reports cancellation, the partial result is returned.
        """
        retDict = {}
        self.mem = {}
        self._rememberRegex("ursr", fsm, "regex URSR;")
        self._rememberRegex("glur", fsm, "regex GlUR;")

        wordsDone = 0
        # Only the total is filled in here; the running count is substituted
        # on every update below.
        progressLabel = "Parsed %s of " + "%s tokens." % len(wordforms)
        if self.progressBar:
            self.progressBar.setProgress(0)

        for word in wordforms:
            retDict[word] = []
            self.restoreArgs("ursr")
            self.addCommand("apply up %s" % word)
            underlyingForms = self._run().splitlines()
            for uf in [x.strip() for x in underlyingForms if x.strip()]:
                self.restoreArgs("glur")
                self.addCommand("apply up %s" % uf)
                mfs = self._run().splitlines()
                for mf in [x.strip() for x in mfs if x.strip()]:
                    retDict[word].append({"morpheme": uf, "gloss": mf})
            wordsDone += 1
            if self.progressBar:
                self.progressBar.setLabelText(progressLabel % wordsDone)
                self.progressBar.setProgress(wordsDone)
                # Let the GUI process pending events (including a cancel).
                qApp.processEvents()
                if self.progressBar.wasCanceled():
                    return retDict

        if self.progressBar:
            self.progressBar.reset()
        self.progressBar = None
        return retDict


# Earlier variant of XeroxParser._parse without progress-bar support, kept
# commented out:
#
#    def _parse(self, fsm, wordforms):
#        retDict={}
#        self.mem={}
#        self._initialize(self.values["Parse Flag(s)"])
#        self.addCommand("loadd %s" % fsm)
#        self.addCommand("regex URSR;")
#        self.rememberArgs("ursr")
#        self._initialize(self.values["Parse Flag(s)"])
#        self.addCommand("loadd %s" % fsm)
#        self.addCommand("regex GlUR;")
#        self.rememberArgs("glur")
#        for word in [x.strip() for x in wordforms if x.strip()]:
#            print "Parsing word..."
#            retDict[word]=[]
#            self.restoreArgs("ursr")
#            self.addCommand("apply up %s" % word)
#            underlyingForms=self._run().splitlines()
#            for uf in [x.strip() for x in underlyingForms if x.strip()]:
#                self.restoreArgs("glur")
#                self.addCommand("apply up %s" % uf)
#                mfs=self._run().splitlines()
#                for mf in [x.strip() for x in mfs if x.strip()]:
#                    retDict[word].append({"morpheme":uf,"gloss":mf})
#            print "...done"
#        return retDict

# Module-level instances of the two plugin classes.
x = XeroxTokenizer()
y = XeroxParser()