import os.path
import os
import xmlrpclib
import xml.dom.minidom
import urllib
from Ft.Xml.Xslt.Processor import Processor
from Ft.Xml.InputSource import DefaultFactory
from Ft.Lib import Uri
def package_home(gdict):
    """Return the directory that contains a module's source file.

    @param gdict: a module globals() dictionary; its "__file__" entry is
                  the path whose directory part is returned
    """
    return os.path.dirname(gdict["__file__"])
def getTextFromNode(nodename):
    """Return the concatenated text of the direct text children of a node.

    Only immediate children whose nodeType is TEXT_NODE contribute; text
    nested inside child elements is not collected.

    @param nodename: a DOM node (despite the name, a node, not a string)
    @return: the joined text, or "" when the node has no text children
    """
    # join() instead of repeated "+" so long nodes are not copied over and over
    return "".join([node.data for node in nodename.childNodes
                    if node.nodeType == node.TEXT_NODE])
class DonatusFile:
    '''
    An XML document that is lemmatized with the help of the Donatus
    XML-RPC service.

    The processing steps build on each other and every step caches its
    result on the instance:
    generateWordList -> analyseWordList -> wordListToHash and
    lemmatizeFile -> addFormToWords -> convertedXML -> wordsToLinks.
    '''

    # XML-RPC endpoint used by analyseWordList().
    DONATUS_URL = "http://archimedes.fas.harvard.edu/cgi-bin/donatus-rpc"

    # Document returned by analyseWordList() when the analysis fails.
    ANALYSIS_ERROR_XML = """<?xml version="1.0"?><ERROR>cannot analyse wordlist</ERROR>"""

    # Anchor written by wordsToLinks() for analysed words: (lemma form, text).
    DICT_LINK = """<a href="http://141.14.236.86/cgi-bin/toc/dict?step=remotetable;word=%s;lang=de" target="_blank">%s</a> """

    def __init__(self, fileName=None, url=None, txt=None, baseUri=None):
        '''
        The first non-empty argument of fileName, url, txt selects the source.

        @param fileName: path to a local file; converted to a URI
        @param url: URI of the document, used as given
        @param txt: the document itself as a string
        @param baseUri: passed as URI of the input source when txt is
                        transformed in lemmatizeFile
        '''
        # Always define every attribute so later steps never hit an
        # AttributeError, even when no source was given.
        self.fileName = None
        self.file_uri = None
        self.txt = None
        self.baseUri = baseUri

        if fileName:
            self.fileName = fileName
            self.file_uri = Uri.OsPathToUri(fileName, attemptAbsolute=1)
        elif url:
            self.fileName = self.file_uri = url
            # legacy lower-case spelling, kept for callers that read it
            self.filename = url
        elif txt:
            self.fileName = "txt"
            self.txt = txt

    def _makeProcessor(self, stylesheetName):
        '''
        Return an XSLT processor loaded with a stylesheet that lives in
        the same directory as this module.
        '''
        xsltproc = Processor()
        xsl_path = os.path.join(package_home(globals()), stylesheetName)
        xsl_uri = Uri.OsPathToUri(xsl_path, attemptAbsolute=1)
        xsltproc.appendStylesheet(DefaultFactory.fromUri(xsl_uri))
        return xsltproc

    def generateWordList(self):
        '''
        generate wordList (wtag format for donatus)

        Runs wordlist.xsl over the document and caches the result in
        self.wordList.
        '''
        if not hasattr(self, "wordList"):
            xsltproc = self._makeProcessor('wordlist.xsl')
            if self.file_uri:
                source = DefaultFactory.fromUri(self.file_uri)
            else:
                source = DefaultFactory.fromString(self.txt)
            self.wordList = xsltproc.run(source)
        return self.wordList

    def analyseWordList(self):
        '''
        send the word list to donatus

        @return: the morphData document delivered by the service (cached
                 in self.analysedWordList), or ANALYSIS_ERROR_XML when the
                 word list cannot be built or the call fails. A failure is
                 not cached, so a later call tries again.
        '''
        if hasattr(self, 'analysedWordList'):
            return self.analysedWordList
        try:
            server = xmlrpclib.ServerProxy(self.DONATUS_URL)
            payload = xmlrpclib.Binary(self.generateWordList())
            ret = server.donatus.analyze(payload)
            self.analysedWordList = ret['morphData'].data
        except Exception:
            # best effort: report and hand back an error document instead
            # of raising, so the following steps still get parseable XML
            print("ERROR: cannot analyse words")
            return self.ANALYSIS_ERROR_XML
        return self.analysedWordList

    def wordListToHash(self):
        '''
        wordList to hash

        @return: dictionary mapping the form of every variant element to
                 the list of forms of the lemma elements it occurs in
                 (no duplicates, document order); cached in self.words
        '''
        if not hasattr(self, 'words'):
            words = {}
            dom = xml.dom.minidom.parseString(self.analyseWordList())
            for lemma in dom.getElementsByTagName('lemma'):
                form = lemma.getAttribute('form')
                for variant in lemma.getElementsByTagName('variant'):
                    lemmaForms = words.setdefault(variant.getAttribute('form'), [])
                    # a repeated lemma must not wipe the lemmas seen before
                    if form not in lemmaForms:
                        lemmaForms.append(form)
            self.words = words
        return self.words

    def lemmatizeFile(self):
        '''
        lemmatize file

        Runs lemmatize.xsl over the document and caches the result in
        self.lemmatizedFile.
        '''
        if not hasattr(self, 'lemmatizedFile'):
            xsltproc = self._makeProcessor('lemmatize.xsl')
            if getattr(self, 'file_uri', None):
                source = DefaultFactory.fromUri(self.file_uri)
            else:
                source = DefaultFactory.fromString(self.txt, self.baseUri)
            self.lemmatizedFile = xsltproc.run(source)
        return self.lemmatizedFile

    def addFormToWords(self):
        '''
        add form attributes to the words

        Every mpiwg:w element of the lemmatized document gets
        mpiwg:form / mpiwg:analysed attributes: the first known lemma and
        "yes" when the word is in the hash, otherwise the word itself and
        "no". One trailing punctuation character is ignored for the lookup.

        @return: the annotated DOM, cached in self.dom_with_attributes
        '''
        if not hasattr(self, 'dom_with_attributes'):
            dom = xml.dom.minidom.parseString(self.lemmatizeFile())
            words = self.wordListToHash()
            for word in dom.getElementsByTagName('mpiwg:w'):
                text = getTextFromNode(word).strip()
                if text and text[-1] in '.!();?[],':
                    textTmp = text[:-1]
                else:
                    textTmp = text

                if textTmp in words:
                    word.setAttribute("mpiwg:form", words[textTmp][0])
                    word.setAttribute("mpiwg:analysed", "yes")
                elif textTmp not in ("", " "):
                    word.setAttribute("mpiwg:form", textTmp)
                    word.setAttribute("mpiwg:analysed", "no")
            self.dom_with_attributes = dom
        return self.dom_with_attributes

    def convertedXML(self):
        '''
        @return: the annotated document serialized as UTF-8
        '''
        return self.addFormToWords().toxml('utf-8')

    def wordsToLinks(self):
        '''
        Re-serialize the annotated document, replacing mpiwg:w elements.

        The text of an analysed word becomes a dictionary link
        (DICT_LINK), the text of a word that has a form but is not
        analysed is wrapped in a plain <a>, a word without form is written
        as plain text. All other elements are written back with their
        attributes; text and attribute values are XML-escaped.

        @return: the generated markup as one string
        '''
        from xml.parsers import expat
        from xml.sax.saxutils import escape, quoteattr

        xmlTxt = self.convertedXML()
        parts = []
        # Parser state lives in this call only (no module globals), so
        # two conversions cannot disturb each other.
        state = {'in_word': False, 'attrs': {}}

        def start_element(name, attrs):
            if name == "mpiwg:w":
                # the w tag itself is dropped, only its text is written
                state['in_word'] = True
                state['attrs'] = attrs
                return
            tag = "<" + name
            for attr in attrs.keys():
                tag += " %s=%s " % (attr, quoteattr(attrs[attr]))
            tag += ">"
            parts.append(tag)

        def end_element(name):
            if name == "mpiwg:w":
                # also covers empty w elements that never produced text
                state['in_word'] = False
            else:
                parts.append("</%s>" % name)

        def char_data(data):
            text = escape(data)
            attrs = state['attrs']
            if not state['in_word'] or 'mpiwg:form' not in attrs:
                parts.append(text)
            elif attrs.get('mpiwg:analysed') == 'yes':
                form = escape(attrs['mpiwg:form'], {'"': '&quot;'})
                parts.append(self.DICT_LINK % (form, text))
            else:
                parts.append("<a>" + text + "</a>")

        p = expat.ParserCreate()
        # deliver the text of an element in one piece where possible
        p.buffer_text = True
        p.StartElementHandler = start_element
        p.EndElementHandler = end_element
        p.CharacterDataHandler = char_data
        p.Parse(xmlTxt, 1)
        return "".join(parts)
#def convertFile(source,target):
# '''
# @param source:source directory tree
# @param target: target directory tree
# '''
#
# if not os.path.exists(target):
# os.mkdir(target)
# for root,dirs,files in os.walk(source):
#
# for dir in dirs:
#
# dirName=os.path.join(root,dir).replace(source,target)
# if not os.path.exists(dirName):
# os.mkdir(dirName)
#
# for name in files:
# fileName=os.path.join(root,name)
#
# if os.path.splitext(fileName)[1]==".xml":
# fileNameNeu=fileName.replace(source,target)
# print "processing",fileNameNeu
# fh=file(fileNameNeu,"w")
# try:
# fh.write(donatusFile(fileName).convertedXML())
# except:
# print "ERROR:",fileName
# fh.close()
#
#rootDir="/Users/dwinter/Diss/Quellen-primaer/Formax/Done"
#rootDirNeu="/Users/dwinter/Diss/Quellen-primaer/transformed0.1"
#
#convertFile(rootDir,rootDirNeu)
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>