--- ECHO_content/ECHO_language.py 2004/11/24 15:17:41 1.1
+++ ECHO_content/ECHO_language.py 2005/10/26 08:35:53 1.3
@@ -1,32 +1,153 @@
"""Methoden fuer Language Technologies"""
+
def donatus(txt2):
- import xmlrpclib
- server = xmlrpclib.ServerProxy("http://archimedes.fas.harvard.edu/cgi-bin/donatus-rpc")
- txt=txt2.encode('utf-8')
- bin=xmlrpclib.Binary(txt)
-
- ret=server.donatus.analyze(bin)
-
- return ret['morphData'].data
+    """Send txt2 to the donatus XML-RPC service and return the reply's morphData payload.
+
+    txt2 is encoded as UTF-8 and wrapped in an XML-RPC Binary before
+    donatus.analyze is called on the remote server; the value returned is
+    the .data attribute of the reply's 'morphData' entry.
+    """
+    import xmlrpclib
+
+    proxy = xmlrpclib.ServerProxy("http://archimedes.fas.harvard.edu/cgi-bin/donatus-rpc")
+    payload = xmlrpclib.Binary(txt2.encode('utf-8'))
+    reply = proxy.donatus.analyze(payload)
+    return reply['morphData'].data
def donatusVariant2Lemma(morphData):
- """creates hash variant -> morphdata"""
- ret={}
- dom=xml.dom.minidom.parseString(morphData)
- lemmas=dom.getElementsByTagName('lemma')
- for lemma in lemmas:
- variants=lemma.getElementsByTagName('variant')
- for variant in variants:
- atr=variant.getAttribute('form')
- if ret.has_key(atr):
- ret[atr].append=lemma.getAttribute('form')
- else:
- ret[atr]=[lemma.getAttribute('form')]
-
- return ret
+    """Create a hash mapping each variant form to the lemma forms it occurs under.
+
+    morphData is an XML string. For every 'variant' element found below a
+    'lemma' element, the lemma's 'form' attribute is added to the list stored
+    under the variant's 'form' attribute.
+
+    Returns a dict {variant form: [lemma form, ...]}; a document without
+    lemma elements yields an empty dict.
+    """
+    # No module-level import of xml.dom.minidom is visible in this file, so
+    # import it here, in the same function-scope style donatus() uses.
+    import xml.dom.minidom
+
+    ret={}
+    dom=xml.dom.minidom.parseString(morphData)
+    for lemma in dom.getElementsByTagName('lemma'):
+        lemmaForm=lemma.getAttribute('form')
+        for variant in lemma.getElementsByTagName('variant'):
+            # The earlier code did "ret[atr].append=..." for an already known
+            # variant, i.e. it assigned to the list's append attribute instead
+            # of calling it, which raises rather than recording the lemma.
+            ret.setdefault(variant.getAttribute('form'),[]).append(lemmaForm)
+
+    return ret
+
+class ECHO_language:
+ """Language methods: lemmatize a page and tag its words with donatus results.
+
+ NOTE(review): this listing does not preserve indentation, so any nesting
+ mentioned in the comments below is inferred from statement order - confirm
+ against the checked-out file.
+ """
+
+ def donatusVariant2Lemma(self,nr='1'):
+ """Return the module-level donatusVariant2Lemma() result for page nr.
+
+ The page text comes from self.lemmatize(nr) and is sent through donatus().
+ """
+ return donatusVariant2Lemma(donatus(self.lemmatize(nr)))
+
+ def tagLex(self,nr="1"):
+ """Re-parse the lemmatized page nr with expat and return the rebuilt string.
+
+ Start tags are rebuilt by createTag (a w tag emits nothing), character
+ data goes through createData, which looks the text up in the hash
+ returned by donatusVariant2Lemma(donatus(...)).
+ """
+ # The expat callbacks below accumulate into these module-level globals.
+ # NOTE(review): presumably concurrent calls would share them - confirm.
+ global retLex
+ global toggle
+
+ toggle=0
+ retLex=""
+
+ # [0:] takes a full slice of whatever lemmatize() returned
+ lemmatized=self.lemmatize(nr)[0:]
+ #print "ho",repr(lemmatized)
+ variants=donatusVariant2Lemma(donatus(lemmatized))
+
+ # Build the text of a start tag; a w tag yields "" and only sets toggle.
+ def createTag(name,attrs):
+ global toggle
+
+ if name=="w":
+ toggle=1
+ return ""
+ else:
+ tag="<"
+ tag+=name
+ for attr in attrs.keys():
+ tag+=""" %s="%s" """%(attr,attrs[attr])
+ tag+=">"
+ return tag
+
+ # Format character data with astring, using the first lemma form stored
+ # in variants for it when there is one, otherwise the data itself.
+ # NOTE(review): astring holds one %s but is applied to two values, which
+ # raises TypeError as written; the literal looks truncated in this listing.
+ # NOTE(review): if the lookup is nested under "if toggle", data that does
+ # not follow a w tag makes this return None - confirm the intended nesting.
+ def createData(data):
+ global toggle
+ astring="""%s """
+ if toggle: # the tag was a w
+ toggle=0
+ if variants.has_key(data):
+ return astring%(variants[data][0],data)
+ else:
+ return astring%(data,data)
+
+
+
+ # 3 handler functions
+ def start_element(name, attrs):
+ global retLex
+
+ retLex+=createTag(name,attrs)
+ def end_element(name):
+ global retLex
+ # nothing is appended for a closing w; any other name appends "name>"
+ if not name=="w":
+ retLex+="%s>"%(name.encode('utf-8'))
+
+
+ def char_data(data):
+ global retLex
+ if data:
+ try:
+ retLex+=createData(data)
+ # any exception raised while formatting or appending is ignored here
+ except:
+ """no"""
+
+ # NOTE(review): no import of xml.parsers.expat is visible in this file.
+ p = xml.parsers.expat.ParserCreate()
+
+ p.StartElementHandler = start_element
+ p.EndElementHandler = end_element
+ p.CharacterDataHandler = char_data
+
+ # the whole document is handed over in one call (second argument 1)
+ p.Parse(lemmatized.encode('utf-8'),1)
+ #print repr(lemmatized.encode('utf-8'))
+
+ return retLex
+
+
+ def lemmatize(self,nr='1',lang="de"):
+ """Parse self.getPage(nr) with expat and return the rebuilt text.
+
+ Start tags are rebuilt by createTag, character data is split on
+ whitespace and re-joined with newlines by insertW, and lang together
+ with the accumulated text is applied to the txt template at the end.
+ """
+ # the handlers below append to this module-level global
+ global ret
+ ret=""
+
+ # Build the text of a start tag from its name and attributes.
+ def createTag(name,attrs):
+ tag="<"
+ tag+=name
+ for attr in attrs.keys():
+ tag+=""" %s="%s" """%(attr,attrs[attr])
+ tag+=">"
+ return tag
+
+ # Split the text on whitespace and join the pieces with newlines.
+ # NOTE(review): the parameter name shadows the builtin str, and no import
+ # of the string module is visible in this file.
+ def insertW(str):
+ splitted=str.split()
+ wordlist=["%s"%split for split in splitted]
+ return string.join(wordlist,'\n')
+
+ # 3 handler functions
+ def start_element(name, attrs):
+ global ret
+ ret+=createTag(name,attrs)
+ def end_element(name):
+ global ret
+ ret+="%s>"%(name.encode('utf-8'))
+
+ def char_data(data):
+ global ret
+ ret+=insertW(data)
+
+ # NOTE(review): no import of xml.parsers.expat is visible in this file.
+ p = xml.parsers.expat.ParserCreate()
+
+ p.StartElementHandler = start_element
+ p.EndElementHandler = end_element
+ p.CharacterDataHandler = char_data
+
+ p.Parse(self.getPage(nr), 1)
+ # NOTE(review): txt as shown has no % placeholders, so txt%(lang,ret)
+ # would raise TypeError; the template looks truncated in this listing.
+ txt="""
+
+ """
+ ret=txt%(lang,ret)
+
+ return ret