"""Methoden fuer Language Technologies"""
def donatus(txt2):
    """Send *txt2* to the Donatus XML-RPC service and return its morph data.

    The text is UTF-8 encoded, wrapped as an XML-RPC binary value and handed
    to the remote ``donatus.analyze`` method.  The ``data`` payload of the
    ``morphData`` entry of the reply is returned.
    """
    import xmlrpclib

    proxy = xmlrpclib.ServerProxy("http://archimedes.fas.harvard.edu/cgi-bin/donatus-rpc")
    payload = xmlrpclib.Binary(txt2.encode('utf-8'))
    reply = proxy.donatus.analyze(payload)
    return reply['morphData'].data
def donatusVariant2Lemma(morphData):
    """Map every variant form to the lemma forms it is listed under.

    morphData -- XML document, as accepted by
        ``xml.dom.minidom.parseString``, containing ``lemma`` elements with
        nested ``variant`` elements; both carry their text in a ``form``
        attribute.

    Returns a dict ``{variant form: [lemma form, ...]}``.  A variant that
    occurs under several lemmas collects all of them, in document order.
    """
    # Imported here because the module itself has no top-level imports.
    import xml.dom.minidom

    ret = {}
    dom = xml.dom.minidom.parseString(morphData)
    for lemma in dom.getElementsByTagName('lemma'):
        lemma_form = lemma.getAttribute('form')
        for variant in lemma.getElementsByTagName('variant'):
            # setdefault creates the list on first sight of a variant and
            # appends on every later one (the old code assigned to
            # ``list.append`` instead of calling it, which raised).
            ret.setdefault(variant.getAttribute('form'), []).append(lemma_form)
    return ret
class ECHO_language:
    """Language-technology helpers: word tagging and dictionary links.

    The methods call ``self.getPage(nr)``, which is not defined here and
    has to be provided by the class this one is combined with.
    """

    def _startTag(self, name, attrs):
        """Serialize an element start tag from an expat name/attribute pair.

        Attribute values arrive decoded from the parser, so they are
        escaped again to keep the generated markup well-formed.
        """
        from xml.sax.saxutils import escape

        tag = "<" + name
        for attr in attrs.keys():
            tag += """ %s="%s" """ % (attr, escape(attrs[attr], {'"': "&quot;"}))
        tag += ">"
        return tag

    def donatusVariant2Lemma(self, nr='1'):
        """Return the variant -> lemma-forms mapping for page *nr*.

        The page is word-tagged with ``lemmatize``, sent to ``donatus`` and
        the reply is converted by the module-level ``donatusVariant2Lemma``.
        """
        return donatusVariant2Lemma(donatus(self.lemmatize(nr)))

    def tagLex(self, nr="1"):
        """Return page *nr* with every word turned into a dictionary link.

        The page is word-tagged with ``lemmatize`` and analysed by
        ``donatus``.  The tagged XML is then re-serialized: ``<w>`` tags are
        dropped, the text of each ``<w>`` becomes an ``<a>`` element whose
        ``word`` query parameter is the first lemma known for that word (or
        the word itself when none is known), all other tags are copied, and
        text outside ``<w>`` elements is discarded.
        """
        import xml.parsers.expat
        from xml.sax.saxutils import escape

        link = """<a href="http://141.14.236.86/cgi-bin/toc/dict?step=remotetable;word=%s;lang=de" target="_blank">%s</a> """
        quote_entity = {'"': "&quot;"}

        lemmatized = self.lemmatize(nr)
        variants = donatusVariant2Lemma(donatus(lemmatized))

        # Per-call state shared by the handlers below.  Kept in local
        # containers rather than module globals so that concurrent or
        # nested calls cannot overwrite each other's output.
        parts = []
        state = {"in_word": False}

        def start_element(name, attrs):
            if name == "w":
                state["in_word"] = True
            else:
                parts.append(self._startTag(name, attrs))

        def end_element(name):
            if name == "w":
                # An empty <w/> must not leave the flag set for later text.
                state["in_word"] = False
            else:
                parts.append("</%s>" % name)

        def char_data(data):
            # Only the first text chunk directly after a <w> start tag is
            # linked; everything else is dropped on purpose.
            if not data or not state["in_word"]:
                return
            state["in_word"] = False
            if data in variants:
                word = variants[data][0]
            else:
                word = data
            parts.append(link % (escape(word, quote_entity), escape(data)))

        p = xml.parsers.expat.ParserCreate()
        # Deliver each text run in one callback so a word is never split.
        p.buffer_text = True
        p.StartElementHandler = start_element
        p.EndElementHandler = end_element
        p.CharacterDataHandler = char_data
        p.Parse(lemmatized.encode('utf-8'), 1)
        return "".join(parts)

    def lemmatize(self, nr='1', lang="de"):
        """Return page *nr* with every whitespace-separated word in ``<w>``.

        ``self.getPage(nr)`` is parsed as XML.  Tags are copied, each text
        run is split on whitespace and its words are emitted as
        ``<w>word</w>`` joined by newlines (whitespace-only text vanishes).
        The result is wrapped in a ``wtag``/``section``/``s`` envelope whose
        ``lang`` attribute is *lang*.
        """
        import xml.parsers.expat
        from xml.sax.saxutils import escape

        # Local accumulator instead of a module global: keeps calls
        # independent of each other.
        parts = []

        def start_element(name, attrs):
            parts.append(self._startTag(name, attrs))

        def end_element(name):
            parts.append("</%s>" % name)

        def char_data(data):
            # The parser hands over decoded text; escape it again so the
            # output can be parsed once more (tagLex does exactly that).
            words = ["<w>%s</w>" % escape(word) for word in data.split()]
            parts.append("\n".join(words))

        p = xml.parsers.expat.ParserCreate()
        # Deliver each text run in one callback so a word is never split
        # across two <w> elements (e.g. around an entity reference).
        p.buffer_text = True
        p.StartElementHandler = start_element
        p.EndElementHandler = end_element
        p.CharacterDataHandler = char_data
        p.Parse(self.getPage(nr), 1)

        txt = """<wtag locator="xxx">
<section lang="%s"><s>%s</s></section>
</wtag>"""
        return txt % (lang, "".join(parts))
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>