"""Lemmatize XML documents with the Donatus morphology service.

A ``DonatusFile`` wraps one XML document (given as a file path, a URL or a
string), extracts its word list with an XSLT stylesheet, sends that list to
the Donatus XML-RPC service and annotates the ``mpiwg:w`` elements of the
lemmatized document with the lemma returned for each word.
"""

import os.path
import os
import urllib
import xml.dom.minidom
import xml.parsers.expat
import xmlrpclib
from xml.sax.saxutils import escape

from Ft.Xml.Xslt.Processor import Processor
from Ft.Xml.InputSource import DefaultFactory
from Ft.Lib import Uri

# XML-RPC endpoint of the Donatus morphological analyser.
DONATUS_RPC_URL = "http://archimedes.fas.harvard.edu/cgi-bin/donatus-rpc"

# Dictionary lookup URL; ``%s`` is replaced by the lemma.
DICTIONARY_URL = "http://141.14.236.86/cgi-bin/toc/dict?step=remotetable;word=%s;lang=de"

# Value returned by ``analyseWordList`` when the analysis fails.
ANALYSIS_FAILED = """cannot analyse wordlist"""

# A single trailing character from this set is removed before a word is
# looked up in the analysed word list.
TRAILING_PUNCTUATION = '.!();?[],'

# NOTE(review): the markup in the original format strings was lost (the
# template had one ``%s`` but was given two values, and the end-tag template
# was empty).  The anchor below is reconstructed from the surrounding code --
# confirm against the intended output.
LINK_TEMPLATE = '<a href="%s">%s</a> '


def package_home(gdict):
    """Return the directory that contains the module owning *gdict*.

    @param gdict: a module's ``globals()`` dictionary; it must contain
                  ``__file__``.
    """
    filename = gdict["__file__"]
    return os.path.dirname(filename)


def getTextFromNode(nodename):
    """Return the concatenated data of the direct text children of a node.

    Text inside nested elements is not included.

    @param nodename: a DOM node (despite the parameter name, not a string).
    """
    return "".join([child.data
                    for child in nodename.childNodes
                    if child.nodeType == child.TEXT_NODE])


class DonatusFile:
    """One XML document together with its cached Donatus analysis results.

    Every processing step stores its result on the instance on first use, so
    each method can be called repeatedly without redoing the work.
    """

    def __init__(self, fileName=None, url=None, txt=None, baseUri=None):
        '''
        The first non-empty of fileName, url and txt selects the source.

        @param fileName: path to the file
        @param url: URL of the document, used when no fileName is given
        @param txt: document content as a string, used when neither
                    fileName nor url is given
        @param baseUri: base URI handed to the XSLT input source when the
                        document is given as txt
        '''
        # Always define every attribute, so that an instance created without
        # a source does not fail later with an AttributeError.
        self.baseUri = baseUri
        self.fileName = None
        self.file_uri = None
        self.txt = None

        if fileName:
            self.fileName = fileName
            self.file_uri = Uri.OsPathToUri(fileName, attemptAbsolute=1)
        elif url:
            self.fileName = self.file_uri = url
            # Earlier versions stored the URL under this misspelled name
            # only; keep it for code that may still read it.
            self.filename = url
        elif txt:
            self.fileName = "txt"
            self.txt = txt

    def _applyStylesheet(self, stylesheetName):
        """Run the document through a stylesheet stored next to this module.

        @param stylesheetName: file name of the XSLT stylesheet.
        @return: the result of the transformation.
        """
        xsltproc = Processor()
        xsl_path = os.path.join(package_home(globals()), stylesheetName)
        xsl_uri = Uri.OsPathToUri(xsl_path, attemptAbsolute=1)
        xsltproc.appendStylesheet(DefaultFactory.fromUri(xsl_uri))

        if getattr(self, 'file_uri', None):
            source = DefaultFactory.fromUri(self.file_uri)
        else:
            source = DefaultFactory.fromString(self.txt,
                                               getattr(self, 'baseUri', None))
        return xsltproc.run(source)[0:]

    def generateWordList(self):
        ''' generate wordList (wtag format for donatus) '''
        if not hasattr(self, "wordList"):
            self.wordList = self._applyStylesheet('wordlist.xsl')
        return self.wordList

    def analyseWordList(self):
        '''
        send the wordList to donatus and return the morphological data

        The result is cached in self.analysedWordList.  On any failure an
        error is printed and the string "cannot analyse wordlist" is
        returned; the failure is not cached, so a later call tries again.
        '''
        if hasattr(self, 'analysedWordList'):
            return self.analysedWordList

        # NOTE(review): a stray undefined name used to sit at the top of this
        # try block and forced the error path on every call.  If it was meant
        # to disable the remote call, replace it with an explicit switch.
        try:
            server = xmlrpclib.ServerProxy(DONATUS_RPC_URL)
            payload = xmlrpclib.Binary(self.generateWordList())
            ret = server.donatus.analyze(payload)
            self.analysedWordList = ret['morphData'].data[0:]
        except Exception:
            # Best effort: report and hand back the marker string.  The
            # marker must not be stored under the method's own name.
            print("ERROR: cannot analyse words")
            return ANALYSIS_FAILED
        return self.analysedWordList

    def wordListToHash(self):
        '''
        wordList to hash

        @return: dictionary that maps each variant form to the list of lemma
                 forms it belongs to.  An empty dictionary is returned (and
                 not cached) when the analysis result is not parsable XML.
        '''
        if hasattr(self, 'words'):
            return self.words

        try:
            dom = xml.dom.minidom.parseString(self.analyseWordList())
        except xml.parsers.expat.ExpatError:
            # analyseWordList returns a plain error string on failure.
            print("ERROR: cannot parse analysed word list")
            return {}

        words = {}
        for lemma in dom.getElementsByTagName('lemma'):
            form = lemma.getAttribute('form')
            for variant in lemma.getElementsByTagName('variant'):
                lemmaForms = words.setdefault(variant.getAttribute('form'), [])
                # Add each lemma once; never discard lemmas already recorded.
                if form not in lemmaForms:
                    lemmaForms.append(form)

        # Cache only after the whole document was processed.
        self.words = words
        return self.words

    def lemmatizeFile(self):
        ''' lemmatize file '''
        if not hasattr(self, 'lemmatizedFile'):
            self.lemmatizedFile = self._applyStylesheet('lemmatize.xsl')
        return self.lemmatizedFile

    def addFormToWords(self):
        '''
        add form attributes to the words

        Every mpiwg:w element of the lemmatized document gets mpiwg:form and
        mpiwg:analysed attributes: the first known lemma and "yes" when the
        word is in the analysed word list, otherwise the word itself and
        "no".  Elements without text are left untouched.

        @return: the annotated DOM document
        '''
        if not hasattr(self, 'dom_with_attributes'):
            dom = xml.dom.minidom.parseString(self.lemmatizeFile())
            words = self.wordListToHash()

            for word in dom.getElementsByTagName('mpiwg:w'):
                text = getTextFromNode(word).strip()
                if text and text[-1] in TRAILING_PUNCTUATION:
                    key = text[:-1]
                else:
                    key = text

                if key in words:
                    word.setAttribute("mpiwg:form", words[key][0])
                    word.setAttribute("mpiwg:analysed", "yes")
                elif key != "" and key != " ":
                    word.setAttribute("mpiwg:form", key)
                    word.setAttribute("mpiwg:analysed", "no")

            self.dom_with_attributes = dom
        return self.dom_with_attributes

    def convertedXML(self):
        ''' return the annotated document serialized as UTF-8 XML '''
        return self.addFormToWords().toxml('utf-8')

    def wordsToLinks(self):
        '''
        return the annotated document with the analysed words as links

        The mpiwg:w tags themselves are dropped.  The content of a word that
        was analysed successfully is wrapped in a link to the dictionary
        entry of its lemma; all other content is copied.
        '''
        xmlTxt = self.convertedXML()
        out = []
        # Python 2 has no ``nonlocal``, so the state shared by the handlers
        # lives in a dictionary.  ``attrs`` is None outside of mpiwg:w.
        word = {'attrs': None, 'parts': []}

        def emit(piece):
            # Inside a word everything is collected first, so that the
            # complete content ends up inside one link.
            if word['attrs'] is None:
                out.append(piece)
            else:
                word['parts'].append(piece)

        # 3 handler functions
        def start_element(name, attrs):
            if name == "mpiwg:w":
                word['attrs'] = attrs
                word['parts'] = []
                return
            tag = "<" + name
            for attr in attrs.keys():
                value = escape(attrs[attr], {'"': '&quot;'})
                tag += """ %s="%s" """ % (attr, value)
            emit(tag + ">")

        def end_element(name):
            if name != "mpiwg:w":
                emit("</%s>" % name)
                return

            attrs = word['attrs'] or {}
            content = "".join(word['parts'])
            word['attrs'] = None
            word['parts'] = []

            if 'mpiwg:form' in attrs and attrs.get('mpiwg:analysed') == 'yes':
                # NOTE(review): assumes the dictionary service expects the
                # lemma UTF-8 encoded and URL-quoted -- confirm.
                lemma = urllib.quote(attrs['mpiwg:form'].encode('utf-8'))
                url = escape(DICTIONARY_URL % lemma, {'"': '&quot;'})
                out.append(LINK_TEMPLATE % (url, content))
            else:
                out.append(content)

        def char_data(data):
            # expat delivers unescaped text; escape it again for the output.
            emit(escape(data))

        p = xml.parsers.expat.ParserCreate()
        p.StartElementHandler = start_element
        p.EndElementHandler = end_element
        p.CharacterDataHandler = char_data
        p.Parse(xmlTxt, 1)

        return "".join(out)


# Disabled batch conversion script, kept for reference.
#
#def convertFile(source,target):
#    '''
#    @param source:source directory tree
#    @param target: target directory tree
#    '''
#
#    if not os.path.exists(target):
#        os.mkdir(target)
#    for root,dirs,files in os.walk(source):
#
#        for dir in dirs:
#
#            dirName=os.path.join(root,dir).replace(source,target)
#            if not os.path.exists(dirName):
#                os.mkdir(dirName)
#
#        for name in files:
#            fileName=os.path.join(root,name)
#
#            if os.path.splitext(fileName)[1]==".xml":
#                fileNameNeu=fileName.replace(source,target)
#                print "processing",fileNameNeu
#                fh=file(fileNameNeu,"w")
#                try:
#                    fh.write(donatusFile(fileName).convertedXML())
#                except:
#                    print "ERROR:",fileName
#                fh.close()
#
#rootDir="/Users/dwinter/Diss/Quellen-primaer/Formax/Done"
#rootDirNeu="/Users/dwinter/Diss/Quellen-primaer/transformed0.1"
#
#convertFile(rootDir,rootDirNeu)