--- ECHO_content/ECHO_language.py 2004/11/24 15:17:41 1.1 +++ ECHO_content/ECHO_language.py 2005/10/26 08:35:53 1.3 @@ -1,32 +1,153 @@ """Methoden fuer Language Technologies""" + def donatus(txt2): - import xmlrpclib + import xmlrpclib - server = xmlrpclib.ServerProxy("http://archimedes.fas.harvard.edu/cgi-bin/donatus-rpc") + server = xmlrpclib.ServerProxy("http://archimedes.fas.harvard.edu/cgi-bin/donatus-rpc") - txt=txt2.encode('utf-8') - bin=xmlrpclib.Binary(txt) + txt=txt2.encode('utf-8') + bin=xmlrpclib.Binary(txt) - + - ret=server.donatus.analyze(bin) + ret=server.donatus.analyze(bin) - - return ret['morphData'].data + + return ret['morphData'].data def donatusVariant2Lemma(morphData): - """creates hash variant -> morphdata""" - ret={} - dom=xml.dom.minidom.parseString(morphData) - lemmas=dom.getElementsByTagName('lemma') - for lemma in lemmas: - variants=lemma.getElementsByTagName('variant') - for variant in variants: - atr=variant.getAttribute('form') - if ret.has_key(atr): - ret[atr].append=lemma.getAttribute('form') - else: - ret[atr]=[lemma.getAttribute('form')] - - return ret + """creates hash variant -> morphdata""" + ret={} + dom=xml.dom.minidom.parseString(morphData) + lemmas=dom.getElementsByTagName('lemma') + for lemma in lemmas: + variants=lemma.getElementsByTagName('variant') + for variant in variants: + atr=variant.getAttribute('form') + if ret.has_key(atr): + ret[atr].append=lemma.getAttribute('form') + else: + ret[atr]=[lemma.getAttribute('form')] + + return ret + +class ECHO_language: + """language methods""" + + def donatusVariant2Lemma(self,nr='1'): + """analyze by donatus""" + return donatusVariant2Lemma(donatus(self.lemmatize(nr))) + + def tagLex(self,nr="1"): + """generate Links""" + global retLex + global toggle + + toggle=0 + retLex="" + + lemmatized=self.lemmatize(nr)[0:] + #print "ho",repr(lemmatized) + variants=donatusVariant2Lemma(donatus(lemmatized)) + + def createTag(name,attrs): + global toggle + + if name=="w": + toggle=1 + return "" + else: + tag="<" + tag+=name + for attr in attrs.keys(): + tag+=""" %s="%s" """%(attr,attrs[attr]) + tag+=">" + return tag + + def createData(data): + global toggle + astring="""%s """ + if toggle: # tag war ein w + toggle=0 + if variants.has_key(data): + return astring%(variants[data][0],data) + else: + return astring%(data,data) + + + + # 3 handler functions + def start_element(name, attrs): + global retLex + + retLex+=createTag(name,attrs) + def end_element(name): + global retLex + if not name=="w": + retLex+=""%(name.encode('utf-8')) + + + def char_data(data): + global retLex + if data: + try: + retLex+=createData(data) + except: + """no""" + + p = xml.parsers.expat.ParserCreate() + + p.StartElementHandler = start_element + p.EndElementHandler = end_element + p.CharacterDataHandler = char_data + + p.Parse(lemmatized.encode('utf-8'),1) + #print repr(lemmatized.encode('utf-8')) + + return retLex + + + def lemmatize(self,nr='1',lang="de"): + """lemmatize""" + global ret + ret="" + + def createTag(name,attrs): + tag="<" + tag+=name + for attr in attrs.keys(): + tag+=""" %s="%s" """%(attr,attrs[attr]) + tag+=">" + return tag + + def insertW(str): + splitted=str.split() + wordlist=["%s"%split for split in splitted] + return string.join(wordlist,'\n') + + # 3 handler functions + def start_element(name, attrs): + global ret + ret+=createTag(name,attrs) + def end_element(name): + global ret + ret+=""%(name.encode('utf-8')) + + def char_data(data): + global ret + ret+=insertW(data) + + p = xml.parsers.expat.ParserCreate() + + p.StartElementHandler = start_element + p.EndElementHandler = end_element + p.CharacterDataHandler = char_data + + p.Parse(self.getPage(nr), 1) + txt=""" +
%s
+
""" + ret=txt%(lang,ret) + + return ret