--- ECHO_content/ECHO_language.py 2004/11/24 15:17:41 1.1
+++ ECHO_content/ECHO_language.py 2004/11/27 22:38:44 1.2
@@ -1,4 +1,5 @@
"""Methoden fuer Language Technologies"""
+
def donatus(txt2):
import xmlrpclib
@@ -30,3 +31,123 @@ def donatusVariant2Lemma(morphData):
ret[atr]=[lemma.getAttribute('form')]
return ret
+
+class ECHO_language:
+ """Language methods: lemmatize a page and tag its words using donatus."""
+
+ def donatusVariant2Lemma(self,nr='1'):
+ """Return the module-level donatusVariant2Lemma() result for the lemmatized page nr."""
+ return donatusVariant2Lemma(donatus(self.lemmatize(nr)))  # bare names resolve to the module-level functions, not to this method
+
+ def tagLex(self,nr="1"):
+ """Generate links: re-parse the lemmatized page nr and rebuild it using the donatus variants."""
+ global retLex  # output accumulator; the nested handlers below declare the same global
+ global toggle  # flag set by createTag on a "w" start tag, tested and cleared by createData
+
+ toggle=0
+ retLex=""
+
+ lemmatized=self.lemmatize(nr)[0:]  # [0:] slices from the start, i.e. the whole value
+ #print "ho",repr(lemmatized)
+ variants=donatusVariant2Lemma(donatus(lemmatized))  # module-level function, not the method of the same name
+
+ def createTag(name,attrs):  # returns "" for a "w" start tag, otherwise the rebuilt start tag
+ global toggle
+
+ if name=="w":
+ toggle=1
+ return ""
+ else:
+ tag="<"
+ tag+=name
+ for attr in attrs.keys():
+ tag+=""" %s="%s" """%(attr,attrs[attr])
+ tag+=">"
+ return tag
+
+ def createData(data):
+ global toggle
+ astring="""%s """  # NOTE(review): one %s here, but two values are supplied below — confirm the template text against the repository revision
+ if toggle: # the preceding tag was a "w"
+ toggle=0
+ if variants.has_key(data):
+ return astring%(variants[data][0],data)
+ else:
+ return astring%(data,data)
+
+
+
+ # the three expat handler functions
+ def start_element(name, attrs):
+ global retLex
+
+ retLex+=createTag(name,attrs)
+ def end_element(name):
+ global retLex
+ if not name=="w":
+ retLex+="%s>"%(name.encode('utf-8'))  # NOTE(review): emits "name>" with no "</" prefix as shown — confirm the intended closing-tag text
+
+
+ def char_data(data):
+ global retLex
+ if data:
+ try:
+ retLex+=createData(data)
+ except:  # NOTE(review): bare except — any error raised by createData or by the += is silently discarded
+ """no"""
+
+ p = xml.parsers.expat.ParserCreate()  # NOTE(review): no import of xml.parsers.expat is visible in this excerpt — confirm
+
+ p.StartElementHandler = start_element
+ p.EndElementHandler = end_element
+ p.CharacterDataHandler = char_data
+
+ p.Parse(lemmatized.encode('utf-8'),1)  # second argument 1 = isfinal: the whole document is passed in one call
+ #print repr(lemmatized.encode('utf-8'))
+
+ return retLex
+
+
+ def lemmatize(self,nr='1',lang="de"):
+ """Parse self.getPage(nr) with expat, rebuild its tags and word-split text, and fill a template with lang and the result."""
+ global ret  # module-level accumulator written by the nested handlers below
+ ret=""
+
+ def createTag(name,attrs):  # rebuilds a start tag from the element name and its attributes
+ tag="<"
+ tag+=name
+ for attr in attrs.keys():
+ tag+=""" %s="%s" """%(attr,attrs[attr])
+ tag+=">"
+ return tag
+
+ def insertW(str):  # splits the text on whitespace and joins the pieces with newlines
+ splitted=str.split()
+ wordlist=["%s"%split for split in splitted]  # NOTE(review): "%s" adds nothing around each word as shown — confirm whether wrapper text is missing
+ return string.join(wordlist,'\n')  # NOTE(review): no "import string" is visible in this excerpt — confirm
+
+ # the three expat handler functions
+ def start_element(name, attrs):
+ global ret
+ ret+=createTag(name,attrs)
+ def end_element(name):
+ global ret
+ ret+="%s>"%(name.encode('utf-8'))  # NOTE(review): emits "name>" with no "</" prefix as shown — confirm the intended closing-tag text
+
+ def char_data(data):
+ global ret
+ ret+=insertW(data)
+
+ p = xml.parsers.expat.ParserCreate()  # NOTE(review): no import of xml.parsers.expat is visible in this excerpt — confirm
+
+ p.StartElementHandler = start_element
+ p.EndElementHandler = end_element
+ p.CharacterDataHandler = char_data
+
+ p.Parse(self.getPage(nr), 1)  # getPage is not defined in this class; presumably supplied by the class it is combined with — verify
+ txt="""
+
+ """
+ ret=txt%(lang,ret)  # NOTE(review): txt as shown has no %s placeholders for these two values — confirm the template text
+
+ return ret